{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 11232.0, "completions/mean_length": 2382.95703125, "completions/mean_terminated_length": 1960.3902587890625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.2613159120082855, "epoch": 0.002631578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 0.0321967713534832, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 1623882.0, "reward": 0.319697767496109, "reward_std": 0.3174176812171936, "rewards/progression_diversity/mean": -0.0028816750273108482, "rewards/progression_diversity/std": 0.03537697717547417, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.5423176884651184, "rewards/symbolic_reward_partial_score/std": 0.333414226770401, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0534241199493408, "sampling/importance_sampling_ratio/min": 2.7432516327974277e-11, "sampling/sampling_logp_difference/max": 24.319292068481445, "sampling/sampling_logp_difference/mean": 0.10465320944786072, "step": 1 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2668819725513458, "epoch": 0.005263157894736842, "grad_norm": 0.02215813286602497, "learning_rate": 1e-06, "loss": 0.0095, "step": 2 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.25824375450611115, "epoch": 0.007894736842105263, "grad_norm": 0.02271086722612381, "learning_rate": 1e-06, "loss": 0.0381, "step": 3 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.25850287079811096, "epoch": 0.010526315789473684, "grad_norm": 0.030293075367808342, "learning_rate": 1e-06, "loss": 0.0528, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13193.0, "completions/mean_length": 2588.431640625, "completions/mean_terminated_length": 2027.6361083984375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.2614176720380783, "epoch": 0.013157894736842105, "frac_reward_zero_std": 0.0, "grad_norm": 0.030042776837944984, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 3343623.0, "reward": 0.38939642906188965, "reward_std": 0.3555682599544525, "rewards/progression_diversity/mean": -0.0007864796789363027, "rewards/progression_diversity/std": 0.008491357788443565, "rewards/symbolic_reward_accuracy/mean": 0.35546875, "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, "rewards/symbolic_reward_partial_score/mean": 0.5968424081802368, "rewards/symbolic_reward_partial_score/std": 0.35041725635528564, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.051404595375061, "sampling/importance_sampling_ratio/min": 0.00030985596822574735, "sampling/sampling_logp_difference/max": 8.079402923583984, "sampling/sampling_logp_difference/mean": 0.09999503940343857, "step": 5 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2622606158256531, "epoch": 0.015789473684210527, "grad_norm": 0.02478160709142685, "learning_rate": 1e-06, "loss": 0.0686, "step": 6 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2552778720855713, "epoch": 0.018421052631578946, "grad_norm": 0.02954026311635971, "learning_rate": 1e-06, "loss": 0.0668, "step": 7 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.25751645863056183, "epoch": 0.021052631578947368, "grad_norm": 0.026725415140390396, "learning_rate": 1e-06, "loss": 0.0219, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12436.0, "completions/mean_length": 2186.072265625, "completions/mean_terminated_length": 1816.1864013671875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.2712424546480179, "epoch": 0.02368421052631579, "frac_reward_zero_std": 0.0, "grad_norm": 0.032029978930950165, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 4856012.0, "reward": 0.4238227605819702, "reward_std": 0.36550843715667725, "rewards/progression_diversity/mean": -0.0005370433209463954, "rewards/progression_diversity/std": 0.007207411807030439, "rewards/symbolic_reward_accuracy/mean": 0.3984375, "rewards/symbolic_reward_accuracy/std": 0.4900552034378052, "rewards/symbolic_reward_partial_score/mean": 0.6223958134651184, "rewards/symbolic_reward_partial_score/std": 0.3536975085735321, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.055936574935913, "sampling/importance_sampling_ratio/min": 0.00310147344134748, "sampling/sampling_logp_difference/max": 5.775877952575684, "sampling/sampling_logp_difference/mean": 0.10886339843273163, "step": 9 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2743559181690216, "epoch": 0.02631578947368421, "grad_norm": 0.021439263597130775, "learning_rate": 1e-06, "loss": 0.032, "step": 10 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2593652904033661, "epoch": 0.02894736842105263, "grad_norm": 0.030393775552511215, "learning_rate": 1e-06, "loss": 0.0474, "step": 11 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2632688581943512, "epoch": 0.031578947368421054, "grad_norm": 0.023602856323122978, "learning_rate": 1e-06, "loss": 0.0239, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13597.0, "completions/mean_length": 2401.51953125, "completions/mean_terminated_length": 2094.51904296875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.2705395370721817, "epoch": 0.034210526315789476, "frac_reward_zero_std": 0.0, "grad_norm": 0.027274658903479576, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 6479734.0, "reward": 0.4843239188194275, "reward_std": 0.356700599193573, "rewards/progression_diversity/mean": -0.00022850481036584824, "rewards/progression_diversity/std": 0.0038324107881635427, "rewards/symbolic_reward_accuracy/mean": 0.482421875, "rewards/symbolic_reward_accuracy/std": 0.5001795887947083, "rewards/symbolic_reward_partial_score/mean": 0.6554361581802368, "rewards/symbolic_reward_partial_score/std": 0.37485411763191223, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0567376613616943, "sampling/importance_sampling_ratio/min": 0.004240815062075853, "sampling/sampling_logp_difference/max": 5.4629998207092285, "sampling/sampling_logp_difference/mean": 0.11016078293323517, "step": 13 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2603074461221695, "epoch": 0.03684210526315789, "grad_norm": 0.03649430721998215, "learning_rate": 1e-06, "loss": 0.0331, "step": 14 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.2602958679199219, "epoch": 0.039473684210526314, "grad_norm": 0.019655603915452957, "learning_rate": 1e-06, "loss": 0.0592, "step": 15 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.2742217630147934, "epoch": 0.042105263157894736, "grad_norm": 0.019776981323957443, "learning_rate": 1e-06, "loss": 0.0364, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13346.0, "completions/mean_length": 2445.453125, "completions/mean_terminated_length": 1966.755615234375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.2584807947278023, "epoch": 0.04473684210526316, "frac_reward_zero_std": 0.03125, "grad_norm": 0.04284660518169403, "learning_rate": 1e-06, "loss": 0.0521, "num_tokens": 8138942.0, "reward": 0.5522767305374146, "reward_std": 0.32186436653137207, "rewards/progression_diversity/mean": -0.00181739148683846, "rewards/progression_diversity/std": 0.018342694267630577, "rewards/symbolic_reward_accuracy/mean": 0.568359375, "rewards/symbolic_reward_accuracy/std": 0.4957893490791321, "rewards/symbolic_reward_partial_score/mean": 0.71337890625, "rewards/symbolic_reward_partial_score/std": 0.37059590220451355, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.053914189338684, "sampling/importance_sampling_ratio/min": 0.004635987337678671, "sampling/sampling_logp_difference/max": 5.373906135559082, "sampling/sampling_logp_difference/mean": 0.10624644160270691, "step": 17 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.26499390602111816, "epoch": 0.04736842105263158, "grad_norm": 0.02529755048453808, "learning_rate": 1e-06, "loss": 0.038, "step": 18 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.26124098896980286, "epoch": 0.05, "grad_norm": 0.027479395270347595, "learning_rate": 1e-06, "loss": 0.0462, "step": 19 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2685445100069046, "epoch": 0.05263157894736842, "grad_norm": 0.02501635067164898, "learning_rate": 1e-06, "loss": 0.0469, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14541.0, "completions/mean_length": 2491.662109375, "completions/mean_terminated_length": 2101.1142578125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.25292401015758514, "epoch": 0.05526315789473684, "frac_reward_zero_std": 0.09375, "grad_norm": 0.03069983422756195, "learning_rate": 1e-06, "loss": 0.0885, "num_tokens": 9818961.0, "reward": 0.5678541660308838, "reward_std": 0.30515754222869873, "rewards/progression_diversity/mean": -0.0016935247695073485, "rewards/progression_diversity/std": 0.024840721860527992, "rewards/symbolic_reward_accuracy/mean": 0.591796875, "rewards/symbolic_reward_accuracy/std": 0.49198177456855774, "rewards/symbolic_reward_partial_score/mean": 0.7164713144302368, "rewards/symbolic_reward_partial_score/std": 0.38369399309158325, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0556615591049194, "sampling/importance_sampling_ratio/min": 3.9937659049577425e-16, "sampling/sampling_logp_difference/max": 35.456626892089844, "sampling/sampling_logp_difference/mean": 0.10836475342512131, "step": 21 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2719355523586273, "epoch": 0.05789473684210526, "grad_norm": 0.02567470446228981, "learning_rate": 1e-06, "loss": 0.0034, "step": 22 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2798517644405365, "epoch": 0.060526315789473685, "grad_norm": 0.01759127527475357, "learning_rate": 1e-06, "loss": 0.0207, "step": 23 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.26892197132110596, "epoch": 0.06315789473684211, "grad_norm": 0.029061393812298775, "learning_rate": 1e-06, "loss": 0.0581, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15759.0, "completions/mean_length": 2611.63671875, "completions/mean_terminated_length": 2167.366943359375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.2623217850923538, "epoch": 0.06578947368421052, "frac_reward_zero_std": 0.0, "grad_norm": 0.04489253833889961, "learning_rate": 1e-06, "loss": 0.0677, "num_tokens": 11567607.0, "reward": 0.6099430322647095, "reward_std": 0.3489864468574524, "rewards/progression_diversity/mean": -0.0017921316903084517, "rewards/progression_diversity/std": 0.023787712678313255, "rewards/symbolic_reward_accuracy/mean": 0.642578125, "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, "rewards/symbolic_reward_partial_score/mean": 0.755859375, "rewards/symbolic_reward_partial_score/std": 0.3704371750354767, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0542800426483154, "sampling/importance_sampling_ratio/min": 0.00033611588878557086, "sampling/sampling_logp_difference/max": 7.998054504394531, "sampling/sampling_logp_difference/mean": 0.10523411631584167, "step": 25 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2667793333530426, "epoch": 0.06842105263157895, "grad_norm": 0.042507365345954895, "learning_rate": 1e-06, "loss": 0.042, "step": 26 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2573726028203964, "epoch": 0.07105263157894737, "grad_norm": 0.03612668439745903, "learning_rate": 1e-06, "loss": 0.0948, "step": 27 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2717868983745575, "epoch": 0.07368421052631578, "grad_norm": 0.03351793438196182, "learning_rate": 1e-06, "loss": 0.0522, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15168.0, "completions/mean_length": 2566.255859375, "completions/mean_terminated_length": 2062.775390625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.2697293758392334, "epoch": 0.07631578947368421, "frac_reward_zero_std": 0.15625, "grad_norm": 0.036783039569854736, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 13282330.0, "reward": 0.6279209852218628, "reward_std": 0.30352213978767395, "rewards/progression_diversity/mean": -0.0008747372776269913, "rewards/progression_diversity/std": 0.01144934818148613, "rewards/symbolic_reward_accuracy/mean": 0.666015625, "rewards/symbolic_reward_accuracy/std": 0.47209542989730835, "rewards/symbolic_reward_partial_score/mean": 0.771484375, "rewards/symbolic_reward_partial_score/std": 0.36460402607917786, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0549585819244385, "sampling/importance_sampling_ratio/min": 7.974162144819275e-05, "sampling/sampling_logp_difference/max": 9.436718940734863, "sampling/sampling_logp_difference/mean": 0.10604645311832428, "step": 29 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2652689218521118, "epoch": 0.07894736842105263, "grad_norm": 0.031599096953868866, "learning_rate": 1e-06, "loss": 0.0545, "step": 30 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2544643208384514, "epoch": 0.08157894736842106, "grad_norm": 0.03158700093626976, "learning_rate": 1e-06, "loss": 0.1139, "step": 31 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.27170825004577637, "epoch": 0.08421052631578947, "grad_norm": 0.026205774396657944, "learning_rate": 1e-06, "loss": 0.0488, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12587.0, "completions/mean_length": 2206.931640625, "completions/mean_terminated_length": 1895.65869140625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.26374557614326477, "epoch": 0.0868421052631579, "frac_reward_zero_std": 0.1875, "grad_norm": 0.02841874025762081, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 14809751.0, "reward": 0.6377917528152466, "reward_std": 0.28174924850463867, "rewards/progression_diversity/mean": -0.00011936978262383491, "rewards/progression_diversity/std": 0.0016766022890806198, "rewards/symbolic_reward_accuracy/mean": 0.677734375, "rewards/symbolic_reward_accuracy/std": 0.46780112385749817, "rewards/symbolic_reward_partial_score/mean": 0.7744140625, "rewards/symbolic_reward_partial_score/std": 0.37388041615486145, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0545893907546997, "sampling/importance_sampling_ratio/min": 0.0025524995289742947, "sampling/sampling_logp_difference/max": 5.970682144165039, "sampling/sampling_logp_difference/mean": 0.1066381186246872, "step": 33 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2746908664703369, "epoch": 0.08947368421052632, "grad_norm": 0.033538322895765305, "learning_rate": 1e-06, "loss": 0.0432, "step": 34 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2674672603607178, "epoch": 0.09210526315789473, "grad_norm": 0.02931833639740944, "learning_rate": 1e-06, "loss": 0.0377, "step": 35 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2641746252775192, "epoch": 0.09473684210526316, "grad_norm": 0.038021378219127655, "learning_rate": 1e-06, "loss": 0.0341, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11568.0, "completions/mean_length": 2400.705078125, "completions/mean_terminated_length": 1949.6309814453125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.27227024734020233, "epoch": 0.09736842105263158, "frac_reward_zero_std": 0.25, "grad_norm": 0.024732865393161774, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 16416512.0, "reward": 0.6867997050285339, "reward_std": 0.2583235502243042, "rewards/progression_diversity/mean": -0.0016752530355006456, "rewards/progression_diversity/std": 0.021188421174883842, "rewards/symbolic_reward_accuracy/mean": 0.744140625, "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, "rewards/symbolic_reward_partial_score/mean": 0.8095703125, "rewards/symbolic_reward_partial_score/std": 0.3585224747657776, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0554531812667847, "sampling/importance_sampling_ratio/min": 1.577901821292471e-05, "sampling/sampling_logp_difference/max": 11.056829452514648, "sampling/sampling_logp_difference/mean": 0.10788406431674957, "step": 37 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.27088363468647003, "epoch": 0.1, "grad_norm": 0.0261248666793108, "learning_rate": 1e-06, "loss": 0.0765, "step": 38 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.26246321201324463, "epoch": 0.10263157894736842, "grad_norm": 0.019391866400837898, "learning_rate": 1e-06, "loss": 0.0742, "step": 39 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2687445729970932, "epoch": 0.10526315789473684, "grad_norm": 0.02644912153482437, "learning_rate": 1e-06, "loss": 0.0445, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14635.0, "completions/mean_length": 2510.658203125, "completions/mean_terminated_length": 2091.945556640625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.26613570749759674, "epoch": 0.10789473684210527, "frac_reward_zero_std": 0.28125, "grad_norm": 0.0331517830491066, "learning_rate": 1e-06, "loss": 0.0651, "num_tokens": 18096273.0, "reward": 0.6728851199150085, "reward_std": 0.25044363737106323, "rewards/progression_diversity/mean": -0.0015264635439962149, "rewards/progression_diversity/std": 0.01903173141181469, "rewards/symbolic_reward_accuracy/mean": 0.72265625, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.8055012822151184, "rewards/symbolic_reward_partial_score/std": 0.3548940122127533, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.055107831954956, "sampling/importance_sampling_ratio/min": 9.60548914008541e-06, "sampling/sampling_logp_difference/max": 11.553175926208496, "sampling/sampling_logp_difference/mean": 0.10785190761089325, "step": 41 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.26286329329013824, "epoch": 0.11052631578947368, "grad_norm": 0.029875140637159348, "learning_rate": 1e-06, "loss": 0.0671, "step": 42 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.27232979238033295, "epoch": 0.11315789473684211, "grad_norm": 0.026524005457758904, "learning_rate": 1e-06, "loss": 0.0335, "step": 43 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.26245296001434326, "epoch": 0.11578947368421053, "grad_norm": 0.034007180482149124, "learning_rate": 1e-06, "loss": 0.0868, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 10722.0, "completions/mean_length": 2233.7578125, "completions/mean_terminated_length": 1894.152099609375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.2722453474998474, "epoch": 0.11842105263157894, "frac_reward_zero_std": 0.09375, "grad_norm": 0.057794682681560516, "learning_rate": 1e-06, "loss": 0.0629, "num_tokens": 19646613.0, "reward": 0.6934746503829956, "reward_std": 0.30955323576927185, "rewards/progression_diversity/mean": -0.0031212307512760162, "rewards/progression_diversity/std": 0.02668512612581253, "rewards/symbolic_reward_accuracy/mean": 0.75, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.8194986581802368, "rewards/symbolic_reward_partial_score/std": 0.3468964397907257, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0547754764556885, "sampling/importance_sampling_ratio/min": 0.002088946523144841, "sampling/sampling_logp_difference/max": 6.171095371246338, "sampling/sampling_logp_difference/mean": 0.10662943124771118, "step": 45 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2671891450881958, "epoch": 0.12105263157894737, "grad_norm": 0.022444499656558037, "learning_rate": 1e-06, "loss": 0.0136, "step": 46 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.25909220427274704, "epoch": 0.12368421052631579, "grad_norm": 0.023088127374649048, "learning_rate": 1e-06, "loss": 0.0754, "step": 47 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.26286621391773224, "epoch": 0.12631578947368421, "grad_norm": 0.026823926717042923, "learning_rate": 1e-06, "loss": 0.0538, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11726.0, "completions/mean_length": 2254.619140625, "completions/mean_terminated_length": 1769.3677978515625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "entropy": 0.2699515223503113, "epoch": 0.12894736842105264, "frac_reward_zero_std": 0.125, "grad_norm": 0.02539858967065811, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 21198770.0, "reward": 0.674487829208374, "reward_std": 0.2851996421813965, "rewards/progression_diversity/mean": -0.0023922312539070845, "rewards/progression_diversity/std": 0.024297581985592842, "rewards/symbolic_reward_accuracy/mean": 0.728515625, "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, "rewards/symbolic_reward_partial_score/mean": 0.8011067509651184, "rewards/symbolic_reward_partial_score/std": 0.36674949526786804, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0519602298736572, "sampling/importance_sampling_ratio/min": 5.275764829005907e-16, "sampling/sampling_logp_difference/max": 35.17823791503906, "sampling/sampling_logp_difference/mean": 0.1013394445180893, "step": 49 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.25432244688272476, "epoch": 0.13157894736842105, "grad_norm": 0.021180639043450356, "learning_rate": 1e-06, "loss": 0.0514, "step": 50 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2609899342060089, "epoch": 0.13421052631578947, "grad_norm": 0.021488042548298836, "learning_rate": 1e-06, "loss": 0.097, "step": 51 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.24922076612710953, "epoch": 0.1368421052631579, "grad_norm": 0.027309391647577286, "learning_rate": 1e-06, "loss": 0.1207, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16107.0, "completions/mean_length": 2299.81640625, "completions/mean_terminated_length": 1845.48779296875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.26104749739170074, "epoch": 0.1394736842105263, "frac_reward_zero_std": 0.1875, "grad_norm": 0.028313223272562027, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 22783412.0, "reward": 0.6890961527824402, "reward_std": 0.2640804052352905, "rewards/progression_diversity/mean": -0.0015217037871479988, "rewards/progression_diversity/std": 0.01678471639752388, "rewards/symbolic_reward_accuracy/mean": 0.740234375, "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, "rewards/symbolic_reward_partial_score/mean": 0.82568359375, "rewards/symbolic_reward_partial_score/std": 0.3356630504131317, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.052910566329956, "sampling/importance_sampling_ratio/min": 7.863413884479087e-07, "sampling/sampling_logp_difference/max": 14.055874824523926, "sampling/sampling_logp_difference/mean": 0.10275271534919739, "step": 53 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.26632043719291687, "epoch": 0.14210526315789473, "grad_norm": 0.029039481654763222, "learning_rate": 1e-06, "loss": 0.0791, "step": 54 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2686396688222885, "epoch": 0.14473684210526316, "grad_norm": 0.028012612834572792, "learning_rate": 1e-06, "loss": 0.0624, "step": 55 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.25890131294727325, "epoch": 0.14736842105263157, "grad_norm": 0.025674991309642792, "learning_rate": 1e-06, "loss": 0.086, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12780.0, "completions/mean_length": 2413.14453125, "completions/mean_terminated_length": 1874.7139892578125, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "entropy": 0.25308099389076233, "epoch": 0.15, "frac_reward_zero_std": 0.15625, "grad_norm": 0.04013952985405922, "learning_rate": 1e-06, "loss": 0.0407, "num_tokens": 24430270.0, "reward": 0.6752703189849854, "reward_std": 0.308138370513916, "rewards/progression_diversity/mean": -0.002267459873110056, "rewards/progression_diversity/std": 0.02433175779879093, "rewards/symbolic_reward_accuracy/mean": 0.7265625, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.8076171875, "rewards/symbolic_reward_partial_score/std": 0.35207507014274597, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0503523349761963, "sampling/importance_sampling_ratio/min": 7.612612193952373e-07, "sampling/sampling_logp_difference/max": 14.088289260864258, "sampling/sampling_logp_difference/mean": 0.09810857474803925, "step": 57 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2614179700613022, "epoch": 0.15263157894736842, "grad_norm": 0.03701889514923096, "learning_rate": 1e-06, "loss": 0.0238, "step": 58 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.24882755428552628, "epoch": 0.15526315789473685, "grad_norm": 0.0272480770945549, "learning_rate": 1e-06, "loss": 0.1082, "step": 59 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.25699031352996826, "epoch": 0.15789473684210525, "grad_norm": 0.03054182417690754, "learning_rate": 1e-06, "loss": 0.0537, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16104.0, "completions/mean_length": 2519.41796875, "completions/mean_terminated_length": 1955.8170166015625, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.2684636861085892, "epoch": 0.16052631578947368, "frac_reward_zero_std": 0.25, "grad_norm": 0.028775321319699287, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 26120852.0, "reward": 0.7042874693870544, "reward_std": 0.26989448070526123, "rewards/progression_diversity/mean": -0.0009394375374540687, "rewards/progression_diversity/std": 0.01054183766245842, "rewards/symbolic_reward_accuracy/mean": 0.767578125, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.8235676884651184, "rewards/symbolic_reward_partial_score/std": 0.3544165790081024, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0519888401031494, "sampling/importance_sampling_ratio/min": 0.0002099307457683608, "sampling/sampling_logp_difference/max": 8.468732833862305, "sampling/sampling_logp_difference/mean": 0.10232645273208618, "step": 61 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.25643981993198395, "epoch": 0.1631578947368421, "grad_norm": 0.03552790358662605, "learning_rate": 1e-06, "loss": 0.1004, "step": 62 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2648386210203171, "epoch": 0.16578947368421051, "grad_norm": 0.032785214483737946, "learning_rate": 1e-06, "loss": 0.065, "step": 63 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.25679877400398254, "epoch": 0.16842105263157894, "grad_norm": 0.027879441156983376, "learning_rate": 1e-06, "loss": 0.0989, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14055.0, "completions/mean_length": 3021.4609375, "completions/mean_terminated_length": 2160.2578125, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.2547323405742645, "epoch": 0.17105263157894737, "frac_reward_zero_std": 0.15625, "grad_norm": 0.036476247012615204, "learning_rate": 1e-06, "loss": 0.0497, "num_tokens": 28101184.0, "reward": 0.6189697980880737, "reward_std": 0.3368384838104248, "rewards/progression_diversity/mean": -0.002434882801026106, "rewards/progression_diversity/std": 0.020372329279780388, "rewards/symbolic_reward_accuracy/mean": 0.66796875, "rewards/symbolic_reward_accuracy/std": 0.47140273451805115, "rewards/symbolic_reward_partial_score/mean": 0.7449544072151184, "rewards/symbolic_reward_partial_score/std": 0.4009448289871216, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049661636352539, "sampling/importance_sampling_ratio/min": 8.808132175019967e-13, "sampling/sampling_logp_difference/max": 27.757930755615234, "sampling/sampling_logp_difference/mean": 0.0967579260468483, "step": 65 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.24418476969003677, "epoch": 0.1736842105263158, "grad_norm": 0.02933136560022831, "learning_rate": 1e-06, "loss": 0.0656, "step": 66 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2377062812447548, "epoch": 0.1763157894736842, "grad_norm": 0.028272006660699844, "learning_rate": 1e-06, "loss": 0.1092, "step": 67 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.251356840133667, "epoch": 0.17894736842105263, "grad_norm": 0.03293720260262489, "learning_rate": 1e-06, "loss": 0.0516, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11239.0, "completions/mean_length": 2362.73046875, "completions/mean_terminated_length": 1822.35693359375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.2604978382587433, "epoch": 0.18157894736842106, "frac_reward_zero_std": 0.1875, "grad_norm": 0.031264252960681915, "learning_rate": 1e-06, "loss": 0.0868, "num_tokens": 29700854.0, "reward": 0.7232824563980103, "reward_std": 0.27414870262145996, "rewards/progression_diversity/mean": -0.0008577151456847787, "rewards/progression_diversity/std": 0.009135694243013859, "rewards/symbolic_reward_accuracy/mean": 0.783203125, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.85498046875, "rewards/symbolic_reward_partial_score/std": 0.3152819871902466, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0526225566864014, "sampling/importance_sampling_ratio/min": 1.6764071233410505e-06, "sampling/sampling_logp_difference/max": 13.298857688903809, "sampling/sampling_logp_difference/mean": 0.10294780135154724, "step": 69 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2632171958684921, "epoch": 0.18421052631578946, "grad_norm": 0.0390724278986454, "learning_rate": 1e-06, "loss": 0.0445, "step": 70 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2604610174894333, "epoch": 0.1868421052631579, "grad_norm": 0.03644363954663277, "learning_rate": 1e-06, "loss": 0.1041, "step": 71 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2630729079246521, "epoch": 0.18947368421052632, "grad_norm": 0.02937830239534378, "learning_rate": 1e-06, "loss": 0.0363, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12593.0, "completions/mean_length": 2441.689453125, "completions/mean_terminated_length": 1845.37890625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.2574312090873718, "epoch": 0.19210526315789472, "frac_reward_zero_std": 0.28125, "grad_norm": 0.025905072689056396, "learning_rate": 1e-06, "loss": 0.0491, "num_tokens": 31351639.0, "reward": 0.724668025970459, "reward_std": 0.24668556451797485, "rewards/progression_diversity/mean": -0.003901800373569131, "rewards/progression_diversity/std": 0.030434370040893555, "rewards/symbolic_reward_accuracy/mean": 0.7890625, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.8499348759651184, "rewards/symbolic_reward_partial_score/std": 0.3279423713684082, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0521684885025024, "sampling/importance_sampling_ratio/min": 1.6198048111149e-11, "sampling/sampling_logp_difference/max": 24.84613037109375, "sampling/sampling_logp_difference/mean": 0.10166233777999878, "step": 73 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2581082284450531, "epoch": 0.19473684210526315, "grad_norm": 0.022781765088438988, "learning_rate": 1e-06, "loss": 0.0576, "step": 74 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2618667483329773, "epoch": 0.19736842105263158, "grad_norm": 0.02607133612036705, "learning_rate": 1e-06, "loss": 0.0836, "step": 75 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.25442183017730713, "epoch": 0.2, "grad_norm": 0.019913366064429283, "learning_rate": 1e-06, "loss": 0.0734, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14265.0, "completions/mean_length": 2636.556640625, "completions/mean_terminated_length": 1930.8358154296875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.2596082091331482, "epoch": 0.2026315789473684, "frac_reward_zero_std": 0.1875, "grad_norm": 0.02666478045284748, "learning_rate": 1e-06, "loss": 0.0508, "num_tokens": 33113044.0, "reward": 0.6970432996749878, "reward_std": 0.27402693033218384, "rewards/progression_diversity/mean": -0.0027059155981987715, "rewards/progression_diversity/std": 0.023960862308740616, "rewards/symbolic_reward_accuracy/mean": 0.75, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.837890625, "rewards/symbolic_reward_partial_score/std": 0.3269626796245575, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0499635934829712, "sampling/importance_sampling_ratio/min": 5.463769866764778e-06, "sampling/sampling_logp_difference/max": 12.117371559143066, "sampling/sampling_logp_difference/mean": 0.09771312028169632, "step": 77 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2613191306591034, "epoch": 0.20526315789473684, "grad_norm": 0.025247380137443542, "learning_rate": 1e-06, "loss": 0.0371, "step": 78 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2529866099357605, "epoch": 0.20789473684210527, "grad_norm": 0.030076855793595314, "learning_rate": 1e-06, "loss": 0.0572, "step": 79 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.24038981646299362, "epoch": 0.21052631578947367, "grad_norm": 0.0339297391474247, "learning_rate": 1e-06, "loss": 0.1197, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8195.0, "completions/mean_length": 2479.08984375, "completions/mean_terminated_length": 1765.2855224609375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "entropy": 0.2412530928850174, "epoch": 0.2131578947368421, "frac_reward_zero_std": 0.25, "grad_norm": 0.02495790272951126, "learning_rate": 1e-06, "loss": 0.0973, "num_tokens": 34795330.0, "reward": 0.7531094551086426, "reward_std": 0.24608904123306274, "rewards/progression_diversity/mean": -0.0015581449260935187, "rewards/progression_diversity/std": 0.013487322255969048, "rewards/symbolic_reward_accuracy/mean": 0.82421875, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.8756510019302368, "rewards/symbolic_reward_partial_score/std": 0.30089128017425537, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0499427318572998, "sampling/importance_sampling_ratio/min": 4.466129244207195e-09, "sampling/sampling_logp_difference/max": 19.226743698120117, "sampling/sampling_logp_difference/mean": 0.09754176437854767, "step": 81 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2646462917327881, "epoch": 0.21578947368421053, "grad_norm": 0.02527514286339283, "learning_rate": 1e-06, "loss": 0.051, "step": 82 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2567380368709564, "epoch": 0.21842105263157896, "grad_norm": 0.032149430364370346, "learning_rate": 1e-06, "loss": 0.0786, "step": 83 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.25139138102531433, "epoch": 0.22105263157894736, "grad_norm": 0.03584575280547142, "learning_rate": 1e-06, "loss": 0.078, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14563.0, "completions/mean_length": 2363.416015625, "completions/mean_terminated_length": 1793.4735107421875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.2689914107322693, "epoch": 0.2236842105263158, "frac_reward_zero_std": 0.21875, "grad_norm": 0.028427930548787117, "learning_rate": 1e-06, "loss": 0.0551, "num_tokens": 36393687.0, "reward": 0.7451353073120117, "reward_std": 0.23895391821861267, "rewards/progression_diversity/mean": -0.003074061591178179, "rewards/progression_diversity/std": 0.030696000903844833, "rewards/symbolic_reward_accuracy/mean": 0.8125, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.87060546875, "rewards/symbolic_reward_partial_score/std": 0.3043627142906189, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0520421266555786, "sampling/importance_sampling_ratio/min": 0.0007652377826161683, "sampling/sampling_logp_difference/max": 7.175323963165283, "sampling/sampling_logp_difference/mean": 0.10205866396427155, "step": 85 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.26435205340385437, "epoch": 0.22631578947368422, "grad_norm": 0.02536817453801632, "learning_rate": 1e-06, "loss": 0.0536, "step": 86 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.26483944058418274, "epoch": 0.22894736842105262, "grad_norm": 0.029015418142080307, "learning_rate": 1e-06, "loss": 0.0774, "step": 87 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.24875866621732712, "epoch": 0.23157894736842105, "grad_norm": 0.04244063422083855, "learning_rate": 1e-06, "loss": 0.1163, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13828.0, "completions/mean_length": 2104.162109375, "completions/mean_terminated_length": 1673.1810302734375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "entropy": 0.26499253511428833, "epoch": 0.23421052631578948, "frac_reward_zero_std": 0.375, "grad_norm": 0.018304875120520592, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 37858282.0, "reward": 0.785420298576355, "reward_std": 0.17559418082237244, "rewards/progression_diversity/mean": -0.002899130806326866, "rewards/progression_diversity/std": 0.025376563891768456, "rewards/symbolic_reward_accuracy/mean": 0.857421875, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.9124348759651184, "rewards/symbolic_reward_partial_score/std": 0.25231799483299255, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.052797794342041, "sampling/importance_sampling_ratio/min": 3.9022021169898835e-09, "sampling/sampling_logp_difference/max": 19.361724853515625, "sampling/sampling_logp_difference/mean": 0.10318418592214584, "step": 89 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2627365291118622, "epoch": 0.23684210526315788, "grad_norm": 0.012172169052064419, "learning_rate": 1e-06, "loss": 0.0412, "step": 90 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2614479660987854, "epoch": 0.2394736842105263, "grad_norm": 0.033741675317287445, "learning_rate": 1e-06, "loss": 0.1004, "step": 91 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.26432569324970245, "epoch": 0.24210526315789474, "grad_norm": 0.024774247780442238, "learning_rate": 1e-06, "loss": 0.0617, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12066.0, "completions/mean_length": 2051.4296875, "completions/mean_terminated_length": 1589.088623046875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "entropy": 0.2490312159061432, "epoch": 0.24473684210526317, "frac_reward_zero_std": 0.28125, "grad_norm": 0.035675231367349625, "learning_rate": 1e-06, "loss": 0.0635, "num_tokens": 39296742.0, "reward": 0.7815226316452026, "reward_std": 0.21832206845283508, "rewards/progression_diversity/mean": -0.002041286788880825, "rewards/progression_diversity/std": 0.023288603872060776, "rewards/symbolic_reward_accuracy/mean": 0.857421875, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.9000650644302368, "rewards/symbolic_reward_partial_score/std": 0.2744283676147461, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0516581535339355, "sampling/importance_sampling_ratio/min": 8.132886725187305e-13, "sampling/sampling_logp_difference/max": 27.837690353393555, "sampling/sampling_logp_difference/mean": 0.10091866552829742, "step": 93 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2668968141078949, "epoch": 0.24736842105263157, "grad_norm": 0.012349041178822517, "learning_rate": 1e-06, "loss": 0.0034, "step": 94 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.25852371752262115, "epoch": 0.25, "grad_norm": 0.030288243666291237, "learning_rate": 1e-06, "loss": 0.0817, "step": 95 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.25272224843502045, "epoch": 0.25263157894736843, "grad_norm": 0.037656910717487335, "learning_rate": 1e-06, "loss": 0.0626, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12648.0, "completions/mean_length": 1926.58984375, "completions/mean_terminated_length": 1638.5936279296875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.2652004808187485, "epoch": 0.25526315789473686, "frac_reward_zero_std": 0.34375, "grad_norm": 0.040096741169691086, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 40685300.0, "reward": 0.7350056171417236, "reward_std": 0.2433113306760788, "rewards/progression_diversity/mean": -0.00041471776785328984, "rewards/progression_diversity/std": 0.008273917250335217, "rewards/symbolic_reward_accuracy/mean": 0.798828125, "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, "rewards/symbolic_reward_partial_score/mean": 0.8575846552848816, "rewards/symbolic_reward_partial_score/std": 0.31621140241622925, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0552291870117188, "sampling/importance_sampling_ratio/min": 8.425282566342125e-15, "sampling/sampling_logp_difference/max": 32.40753936767578, "sampling/sampling_logp_difference/mean": 0.10859895497560501, "step": 97 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.26686596870422363, "epoch": 0.2578947368421053, "grad_norm": 0.014504051767289639, "learning_rate": 1e-06, "loss": 0.0306, "step": 98 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.252104789018631, "epoch": 0.26052631578947366, "grad_norm": 0.025143684819340706, "learning_rate": 1e-06, "loss": 0.0521, "step": 99 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.25955578684806824, "epoch": 0.2631578947368421, "grad_norm": 0.028664739802479744, "learning_rate": 1e-06, "loss": 0.0281, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11806.0, "completions/mean_length": 2146.349609375, "completions/mean_terminated_length": 1804.6461181640625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "entropy": 0.25793255865573883, "epoch": 0.2657894736842105, "frac_reward_zero_std": 0.28125, "grad_norm": 0.04812149703502655, "learning_rate": 1e-06, "loss": 0.0663, "num_tokens": 42194055.0, "reward": 0.7436953783035278, "reward_std": 0.23294387757778168, "rewards/progression_diversity/mean": -0.0005821330123580992, "rewards/progression_diversity/std": 0.009706917218863964, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.8722330331802368, "rewards/symbolic_reward_partial_score/std": 0.2982480227947235, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0527888536453247, "sampling/importance_sampling_ratio/min": 3.632110292528523e-06, "sampling/sampling_logp_difference/max": 12.525696754455566, "sampling/sampling_logp_difference/mean": 0.10401815176010132, "step": 101 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2468215674161911, "epoch": 0.26842105263157895, "grad_norm": 0.03846008703112602, "learning_rate": 1e-06, "loss": 0.0883, "step": 102 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2601003050804138, "epoch": 0.2710526315789474, "grad_norm": 0.017417294904589653, "learning_rate": 1e-06, "loss": 0.0133, "step": 103 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2558285742998123, "epoch": 0.2736842105263158, "grad_norm": 0.023298203945159912, "learning_rate": 1e-06, "loss": 0.0655, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14029.0, "completions/mean_length": 1896.59375, "completions/mean_terminated_length": 1608.0, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "entropy": 0.25534459948539734, "epoch": 0.27631578947368424, "frac_reward_zero_std": 0.34375, "grad_norm": 0.03245214372873306, "learning_rate": 1e-06, "loss": 0.0691, "num_tokens": 43574071.0, "reward": 0.7646946907043457, "reward_std": 0.20825007557868958, "rewards/progression_diversity/mean": -0.00025633774930611253, "rewards/progression_diversity/std": 0.0033328270073980093, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.87841796875, "rewards/symbolic_reward_partial_score/std": 0.30651092529296875, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0528056621551514, "sampling/importance_sampling_ratio/min": 0.0034063623752444983, "sampling/sampling_logp_difference/max": 5.68211030960083, "sampling/sampling_logp_difference/mean": 0.10405570268630981, "step": 105 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.25604379177093506, "epoch": 0.2789473684210526, "grad_norm": 0.01872321590781212, "learning_rate": 1e-06, "loss": 0.0386, "step": 106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.25383952260017395, "epoch": 0.28157894736842104, "grad_norm": 0.019688135012984276, "learning_rate": 1e-06, "loss": 0.0223, "step": 107 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.24521780759096146, "epoch": 0.28421052631578947, "grad_norm": 0.032386377453804016, "learning_rate": 1e-06, "loss": 0.0405, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14031.0, "completions/mean_length": 2017.2890625, "completions/mean_terminated_length": 1731.099609375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.2612617313861847, "epoch": 0.2868421052631579, "frac_reward_zero_std": 0.28125, "grad_norm": 0.0264718197286129, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 45012075.0, "reward": 0.7735224366188049, "reward_std": 0.20979472994804382, "rewards/progression_diversity/mean": -0.0012756988871842623, "rewards/progression_diversity/std": 0.01783159375190735, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.8876953125, "rewards/symbolic_reward_partial_score/std": 0.29055485129356384, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0521562099456787, "sampling/importance_sampling_ratio/min": 5.862594480277039e-05, "sampling/sampling_logp_difference/max": 9.744333267211914, "sampling/sampling_logp_difference/mean": 0.10334809869527817, "step": 109 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.25815196335315704, "epoch": 0.2894736842105263, "grad_norm": 0.021397482603788376, "learning_rate": 1e-06, "loss": 0.0101, "step": 110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2469303384423256, "epoch": 0.29210526315789476, "grad_norm": 0.028907410800457, "learning_rate": 1e-06, "loss": 0.1008, "step": 111 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.24571603536605835, "epoch": 0.29473684210526313, "grad_norm": 0.043796975165605545, "learning_rate": 1e-06, "loss": 0.0896, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12841.0, "completions/mean_length": 1870.52734375, "completions/mean_terminated_length": 1581.4144287109375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.2544310688972473, "epoch": 0.29736842105263156, "frac_reward_zero_std": 0.3125, "grad_norm": 0.02333790808916092, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 46372409.0, "reward": 0.7861639857292175, "reward_std": 0.2023507058620453, "rewards/progression_diversity/mean": -0.0017700331518426538, "rewards/progression_diversity/std": 0.019489416852593422, "rewards/symbolic_reward_accuracy/mean": 0.865234375, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.8953450322151184, "rewards/symbolic_reward_partial_score/std": 0.28922733664512634, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0533685684204102, "sampling/importance_sampling_ratio/min": 2.793473868223373e-05, "sampling/sampling_logp_difference/max": 10.485639572143555, "sampling/sampling_logp_difference/mean": 0.10594035685062408, "step": 113 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2571485936641693, "epoch": 0.3, "grad_norm": 0.033841848373413086, "learning_rate": 1e-06, "loss": 0.0585, "step": 114 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.24314817041158676, "epoch": 0.3026315789473684, "grad_norm": 0.022232208400964737, "learning_rate": 1e-06, "loss": 0.1467, "step": 115 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.26761798560619354, "epoch": 0.30526315789473685, "grad_norm": 0.012662280350923538, "learning_rate": 1e-06, "loss": 0.0325, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 12353.0, "completions/mean_length": 1801.455078125, "completions/mean_terminated_length": 1628.53955078125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.24800319969654083, "epoch": 0.3078947368421053, "frac_reward_zero_std": 0.3125, "grad_norm": 0.021536562591791153, "learning_rate": 1e-06, "loss": 0.0693, "num_tokens": 47720738.0, "reward": 0.7720678448677063, "reward_std": 0.22057949006557465, "rewards/progression_diversity/mean": -0.0002511652419343591, "rewards/progression_diversity/std": 0.004545476287603378, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.9055989980697632, "rewards/symbolic_reward_partial_score/std": 0.2584453523159027, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0540456771850586, "sampling/importance_sampling_ratio/min": 0.0003275219933129847, "sampling/sampling_logp_difference/max": 8.023955345153809, "sampling/sampling_logp_difference/mean": 0.10680307447910309, "step": 117 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2550651431083679, "epoch": 0.3105263157894737, "grad_norm": 0.014003097079694271, "learning_rate": 1e-06, "loss": 0.0051, "step": 118 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.25681254267692566, "epoch": 0.3131578947368421, "grad_norm": 0.030175212770700455, "learning_rate": 1e-06, "loss": 0.0557, "step": 119 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.25778651237487793, "epoch": 0.3157894736842105, "grad_norm": 0.020599860697984695, "learning_rate": 1e-06, "loss": 0.01, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11032.0, "completions/mean_length": 2128.177734375, "completions/mean_terminated_length": 1786.0380859375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.26357313990592957, "epoch": 0.31842105263157894, "frac_reward_zero_std": 0.25, "grad_norm": 0.032891228795051575, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 49218685.0, "reward": 0.7386098504066467, "reward_std": 0.24482640624046326, "rewards/progression_diversity/mean": -0.0013221381232142448, "rewards/progression_diversity/std": 0.017824513837695122, "rewards/symbolic_reward_accuracy/mean": 0.798828125, "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, "rewards/symbolic_reward_partial_score/mean": 0.87158203125, "rewards/symbolic_reward_partial_score/std": 0.29892396926879883, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.052980661392212, "sampling/importance_sampling_ratio/min": 0.006169522181153297, "sampling/sampling_logp_difference/max": 5.088133811950684, "sampling/sampling_logp_difference/mean": 0.10582087934017181, "step": 121 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.24658922851085663, "epoch": 0.32105263157894737, "grad_norm": 0.024154705926775932, "learning_rate": 1e-06, "loss": 0.0446, "step": 122 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.25428229570388794, "epoch": 0.3236842105263158, "grad_norm": 0.03276718035340309, "learning_rate": 1e-06, "loss": 0.0973, "step": 123 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2513224929571152, "epoch": 0.3263157894736842, "grad_norm": 0.027816835790872574, "learning_rate": 1e-06, "loss": 0.0737, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 10545.0, "completions/mean_length": 2134.193359375, "completions/mean_terminated_length": 1704.11865234375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "entropy": 0.24834615737199783, "epoch": 0.32894736842105265, "frac_reward_zero_std": 0.3125, "grad_norm": 0.02625676989555359, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 50700704.0, "reward": 0.7628857493400574, "reward_std": 0.21416853368282318, "rewards/progression_diversity/mean": -0.000490223930682987, "rewards/progression_diversity/std": 0.006874173413962126, "rewards/symbolic_reward_accuracy/mean": 0.828125, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.8951822519302368, "rewards/symbolic_reward_partial_score/std": 0.26842740178108215, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0525569915771484, "sampling/importance_sampling_ratio/min": 0.0012693606549873948, "sampling/sampling_logp_difference/max": 6.669241905212402, "sampling/sampling_logp_difference/mean": 0.10405848920345306, "step": 125 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2593626528978348, "epoch": 0.33157894736842103, "grad_norm": 0.028445186093449593, "learning_rate": 1e-06, "loss": 0.0248, "step": 126 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.25416844338178635, "epoch": 0.33421052631578946, "grad_norm": 0.02981475181877613, "learning_rate": 1e-06, "loss": 0.0686, "step": 127 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2521153539419174, "epoch": 0.3368421052631579, "grad_norm": 0.03827949985861778, "learning_rate": 1e-06, "loss": 0.089, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11825.0, "completions/mean_length": 2242.904296875, "completions/mean_terminated_length": 1668.06298828125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.2524839788675308, "epoch": 0.3394736842105263, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0338742621243, "learning_rate": 1e-06, "loss": 0.0749, "num_tokens": 52248207.0, "reward": 0.7660582065582275, "reward_std": 0.2173374444246292, "rewards/progression_diversity/mean": -0.0006320271058939397, "rewards/progression_diversity/std": 0.007326250895857811, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.8966470956802368, "rewards/symbolic_reward_partial_score/std": 0.2703816890716553, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0492503643035889, "sampling/importance_sampling_ratio/min": 0.0005629548104479909, "sampling/sampling_logp_difference/max": 7.482311248779297, "sampling/sampling_logp_difference/mean": 0.09761972725391388, "step": 129 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.24737834185361862, "epoch": 0.34210526315789475, "grad_norm": 0.027043595910072327, "learning_rate": 1e-06, "loss": 0.0991, "step": 130 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.23644034564495087, "epoch": 0.3447368421052632, "grad_norm": 0.02878217026591301, "learning_rate": 1e-06, "loss": 0.0564, "step": 131 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.24825213849544525, "epoch": 0.3473684210526316, "grad_norm": 0.03835158050060272, "learning_rate": 1e-06, "loss": 0.0377, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15272.0, "completions/mean_length": 1937.564453125, "completions/mean_terminated_length": 1620.377197265625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "entropy": 0.24670010060071945, "epoch": 0.35, "frac_reward_zero_std": 0.4375, "grad_norm": 0.022889573127031326, "learning_rate": 1e-06, "loss": 0.0667, "num_tokens": 53642704.0, "reward": 0.8256836533546448, "reward_std": 0.16099657118320465, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9384765625, "rewards/symbolic_reward_partial_score/std": 0.2168099284172058, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0539462566375732, "sampling/importance_sampling_ratio/min": 8.059991523623466e-05, "sampling/sampling_logp_difference/max": 9.426012992858887, "sampling/sampling_logp_difference/mean": 0.1068400889635086, "step": 133 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.25666534900665283, "epoch": 0.3526315789473684, "grad_norm": 0.019193602725863457, "learning_rate": 1e-06, "loss": 0.0584, "step": 134 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.25828083604574203, "epoch": 0.35526315789473684, "grad_norm": 0.025437770411372185, "learning_rate": 1e-06, "loss": 0.0151, "step": 135 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2562948167324066, "epoch": 0.35789473684210527, "grad_norm": 0.026910221204161644, "learning_rate": 1e-06, "loss": 0.047, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13533.0, "completions/mean_length": 2228.55859375, "completions/mean_terminated_length": 1771.931396484375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.2502548471093178, "epoch": 0.3605263157894737, "frac_reward_zero_std": 0.4375, "grad_norm": 0.0259707048535347, "learning_rate": 1e-06, "loss": 0.0639, "num_tokens": 55173518.0, "reward": 0.7800472974777222, "reward_std": 0.17662927508354187, "rewards/progression_diversity/mean": -0.00308664096519351, "rewards/progression_diversity/std": 0.02825590781867504, "rewards/symbolic_reward_accuracy/mean": 0.849609375, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.9095051884651184, "rewards/symbolic_reward_partial_score/std": 0.25963231921195984, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0506607294082642, "sampling/importance_sampling_ratio/min": 0.00025443226331844926, "sampling/sampling_logp_difference/max": 8.27647590637207, "sampling/sampling_logp_difference/mean": 0.10103225708007812, "step": 137 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24615327268838882, "epoch": 0.3631578947368421, "grad_norm": 0.03407861292362213, "learning_rate": 1e-06, "loss": 0.0456, "step": 138 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.24503520876169205, "epoch": 0.36578947368421055, "grad_norm": 0.020230667665600777, "learning_rate": 1e-06, "loss": 0.052, "step": 139 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2436116561293602, "epoch": 0.3684210526315789, "grad_norm": 0.033303920179605484, "learning_rate": 1e-06, "loss": 0.0703, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13365.0, "completions/mean_length": 1732.8828125, "completions/mean_terminated_length": 1500.325439453125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "entropy": 0.25387296825647354, "epoch": 0.37105263157894736, "frac_reward_zero_std": 0.5, "grad_norm": 0.02063675969839096, "learning_rate": 1e-06, "loss": -0.0168, "num_tokens": 56458706.0, "reward": 0.8216304779052734, "reward_std": 0.14440101385116577, "rewards/progression_diversity/mean": -4.0891380194807425e-05, "rewards/progression_diversity/std": 0.0009252663003280759, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.9431965947151184, "rewards/symbolic_reward_partial_score/std": 0.1989695280790329, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0523567199707031, "sampling/importance_sampling_ratio/min": 1.1014159326805384e-06, "sampling/sampling_logp_difference/max": 13.718914031982422, "sampling/sampling_logp_difference/mean": 0.10537827014923096, "step": 141 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.24923139810562134, "epoch": 0.3736842105263158, "grad_norm": 0.02487753890454769, "learning_rate": 1e-06, "loss": 0.0127, "step": 142 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.248472198843956, "epoch": 0.3763157894736842, "grad_norm": 0.017968177795410156, "learning_rate": 1e-06, "loss": 0.0407, "step": 143 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.24632388353347778, "epoch": 0.37894736842105264, "grad_norm": 0.025321682915091515, "learning_rate": 1e-06, "loss": 0.0718, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13227.0, "completions/mean_length": 2052.263671875, "completions/mean_terminated_length": 1708.3021240234375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "entropy": 0.2430901676416397, "epoch": 0.3815789473684211, "frac_reward_zero_std": 0.375, "grad_norm": 0.042146675288677216, "learning_rate": 1e-06, "loss": 0.069, "num_tokens": 57902585.0, "reward": 0.7910541296005249, "reward_std": 0.1890793740749359, "rewards/progression_diversity/mean": -0.0010371711105108261, "rewards/progression_diversity/std": 0.013147015124559402, "rewards/symbolic_reward_accuracy/mean": 0.86328125, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.9181314706802368, "rewards/symbolic_reward_partial_score/std": 0.24215349555015564, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0506622791290283, "sampling/importance_sampling_ratio/min": 3.2121545423535736e-09, "sampling/sampling_logp_difference/max": 19.556324005126953, "sampling/sampling_logp_difference/mean": 0.10158580541610718, "step": 145 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.24240544438362122, "epoch": 0.38421052631578945, "grad_norm": 0.023529332131147385, "learning_rate": 1e-06, "loss": 0.0249, "step": 146 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2464822679758072, "epoch": 0.3868421052631579, "grad_norm": 0.020634565502405167, "learning_rate": 1e-06, "loss": 0.0987, "step": 147 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2451770007610321, "epoch": 0.3894736842105263, "grad_norm": 0.030732842162251472, "learning_rate": 1e-06, "loss": 0.0308, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9631.0, "completions/mean_length": 2138.572265625, "completions/mean_terminated_length": 1679.042236328125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.2397768646478653, "epoch": 0.39210526315789473, "frac_reward_zero_std": 0.3125, "grad_norm": 0.02241184562444687, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 59407358.0, "reward": 0.7700119018554688, "reward_std": 0.21146252751350403, "rewards/progression_diversity/mean": -0.0007633853820152581, "rewards/progression_diversity/std": 0.009358874522149563, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.9007161855697632, "rewards/symbolic_reward_partial_score/std": 0.2675977051258087, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0503795146942139, "sampling/importance_sampling_ratio/min": 1.3879385960535728e-06, "sampling/sampling_logp_difference/max": 13.487690925598145, "sampling/sampling_logp_difference/mean": 0.10065832734107971, "step": 149 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24372172355651855, "epoch": 0.39473684210526316, "grad_norm": 0.034816596657037735, "learning_rate": 1e-06, "loss": 0.0347, "step": 150 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2407890260219574, "epoch": 0.3973684210526316, "grad_norm": 0.021488770842552185, "learning_rate": 1e-06, "loss": 0.0215, "step": 151 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2366773709654808, "epoch": 0.4, "grad_norm": 0.02249423786997795, "learning_rate": 1e-06, "loss": 0.0609, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13010.0, "completions/mean_length": 2161.275390625, "completions/mean_terminated_length": 1672.8182373046875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "entropy": 0.23152073472738266, "epoch": 0.4026315789473684, "frac_reward_zero_std": 0.21875, "grad_norm": 0.03908664733171463, "learning_rate": 1e-06, "loss": 0.0989, "num_tokens": 60915915.0, "reward": 0.7834299206733704, "reward_std": 0.21882712841033936, "rewards/progression_diversity/mean": -0.0017377887852489948, "rewards/progression_diversity/std": 0.021848157048225403, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.90185546875, "rewards/symbolic_reward_partial_score/std": 0.2732160687446594, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0461015701293945, "sampling/importance_sampling_ratio/min": 5.239376719146094e-07, "sampling/sampling_logp_difference/max": 14.461893081665039, "sampling/sampling_logp_difference/mean": 0.09288465231657028, "step": 153 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2292015701532364, "epoch": 0.4052631578947368, "grad_norm": 0.030572297051548958, "learning_rate": 1e-06, "loss": 0.1011, "step": 154 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23775313049554825, "epoch": 0.40789473684210525, "grad_norm": 0.028754258528351784, "learning_rate": 1e-06, "loss": 0.0394, "step": 155 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.24100537598133087, "epoch": 0.4105263157894737, "grad_norm": 0.021660154685378075, "learning_rate": 1e-06, "loss": 0.0211, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13572.0, "completions/mean_length": 1708.77734375, "completions/mean_terminated_length": 1534.762939453125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.24858221411705017, "epoch": 0.4131578947368421, "frac_reward_zero_std": 0.46875, "grad_norm": 0.022633863613009453, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 62199769.0, "reward": 0.8241158723831177, "reward_std": 0.1481999158859253, "rewards/progression_diversity/mean": -0.0005272195558063686, "rewards/progression_diversity/std": 0.010804719291627407, "rewards/symbolic_reward_accuracy/mean": 0.90625, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.9384765625, "rewards/symbolic_reward_partial_score/std": 0.21358926594257355, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0517460107803345, "sampling/importance_sampling_ratio/min": 0.0012036137050017715, "sampling/sampling_logp_difference/max": 6.722426891326904, "sampling/sampling_logp_difference/mean": 0.10469282418489456, "step": 157 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2418229877948761, "epoch": 0.41578947368421054, "grad_norm": 0.024082506075501442, "learning_rate": 1e-06, "loss": 0.0616, "step": 158 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.24460408091545105, "epoch": 0.41842105263157897, "grad_norm": 0.0060430532321333885, "learning_rate": 1e-06, "loss": 0.0383, "step": 159 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.23893345147371292, "epoch": 0.42105263157894735, "grad_norm": 0.014973205514252186, "learning_rate": 1e-06, "loss": 0.0374, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12721.0, "completions/mean_length": 2082.78125, "completions/mean_terminated_length": 1768.782470703125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.23413384705781937, "epoch": 0.4236842105263158, "frac_reward_zero_std": 0.34375, "grad_norm": 0.02440241537988186, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 63668137.0, "reward": 0.8117564916610718, "reward_std": 0.18439695239067078, "rewards/progression_diversity/mean": -0.0011134764645248652, "rewards/progression_diversity/std": 0.014692641794681549, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.9278970956802368, "rewards/symbolic_reward_partial_score/std": 0.23557829856872559, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.048673152923584, "sampling/importance_sampling_ratio/min": 0.0006473406101576984, "sampling/sampling_logp_difference/max": 7.34263801574707, "sampling/sampling_logp_difference/mean": 0.09804990142583847, "step": 161 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2300802245736122, "epoch": 0.4263157894736842, "grad_norm": 0.02143077924847603, "learning_rate": 1e-06, "loss": 0.0697, "step": 162 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.24065203219652176, "epoch": 0.42894736842105263, "grad_norm": 0.02908634953200817, "learning_rate": 1e-06, "loss": 0.0755, "step": 163 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.24057897180318832, "epoch": 0.43157894736842106, "grad_norm": 0.04149603471159935, "learning_rate": 1e-06, "loss": 0.0485, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11887.0, "completions/mean_length": 2101.234375, "completions/mean_terminated_length": 1845.6778564453125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.23806512355804443, "epoch": 0.4342105263157895, "frac_reward_zero_std": 0.375, "grad_norm": 0.03864562138915062, "learning_rate": 1e-06, "loss": 0.0451, "num_tokens": 65144289.0, "reward": 0.7944764494895935, "reward_std": 0.19753678143024445, "rewards/progression_diversity/mean": -0.000595143239479512, "rewards/progression_diversity/std": 0.010785636492073536, "rewards/symbolic_reward_accuracy/mean": 0.8671875, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.91845703125, "rewards/symbolic_reward_partial_score/std": 0.24648991227149963, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049810767173767, "sampling/importance_sampling_ratio/min": 1.5343101040343754e-05, "sampling/sampling_logp_difference/max": 11.084844589233398, "sampling/sampling_logp_difference/mean": 0.10091371834278107, "step": 165 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.23936307430267334, "epoch": 0.4368421052631579, "grad_norm": 0.02704034186899662, "learning_rate": 1e-06, "loss": 0.0388, "step": 166 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2366686388850212, "epoch": 0.4394736842105263, "grad_norm": 0.016262901946902275, "learning_rate": 1e-06, "loss": 0.0106, "step": 167 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.22937704622745514, "epoch": 0.4421052631578947, "grad_norm": 0.013757162727415562, "learning_rate": 1e-06, "loss": 0.0324, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11079.0, "completions/mean_length": 1793.64453125, "completions/mean_terminated_length": 1443.47607421875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.23620514571666718, "epoch": 0.44473684210526315, "frac_reward_zero_std": 0.46875, "grad_norm": 0.018099864944815636, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 66446891.0, "reward": 0.8357792496681213, "reward_std": 0.14379340410232544, "rewards/progression_diversity/mean": -0.0011789561249315739, "rewards/progression_diversity/std": 0.015437182039022446, "rewards/symbolic_reward_accuracy/mean": 0.923828125, "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, "rewards/symbolic_reward_partial_score/mean": 0.94482421875, "rewards/symbolic_reward_partial_score/std": 0.2143392264842987, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049529790878296, "sampling/importance_sampling_ratio/min": 0.0015639930497854948, "sampling/sampling_logp_difference/max": 6.460513114929199, "sampling/sampling_logp_difference/mean": 0.10102123022079468, "step": 169 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.24607934057712555, "epoch": 0.4473684210526316, "grad_norm": 0.014026135206222534, "learning_rate": 1e-06, "loss": 0.0313, "step": 170 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.23312748968601227, "epoch": 0.45, "grad_norm": 0.03011462651193142, "learning_rate": 1e-06, "loss": 0.0888, "step": 171 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.23896265774965286, "epoch": 0.45263157894736844, "grad_norm": 0.009486875496804714, "learning_rate": 1e-06, "loss": 0.0374, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10726.0, "completions/mean_length": 1911.1640625, "completions/mean_terminated_length": 1652.2066650390625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.23735451698303223, "epoch": 0.45526315789473687, "frac_reward_zero_std": 0.5, "grad_norm": 0.0280209518969059, "learning_rate": 1e-06, "loss": 0.0433, "num_tokens": 67804351.0, "reward": 0.8081526756286621, "reward_std": 0.16803589463233948, "rewards/progression_diversity/mean": -0.00016519335622433573, "rewards/progression_diversity/std": 0.003562908386811614, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9249674081802368, "rewards/symbolic_reward_partial_score/std": 0.2344280630350113, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0501618385314941, "sampling/importance_sampling_ratio/min": 0.0007048218976706266, "sampling/sampling_logp_difference/max": 7.257565498352051, "sampling/sampling_logp_difference/mean": 0.10282571613788605, "step": 173 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2369169294834137, "epoch": 0.45789473684210524, "grad_norm": 0.026422906666994095, "learning_rate": 1e-06, "loss": 0.0448, "step": 174 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2406521737575531, "epoch": 0.4605263157894737, "grad_norm": 0.028511840850114822, "learning_rate": 1e-06, "loss": 0.0538, "step": 175 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2376062050461769, "epoch": 0.4631578947368421, "grad_norm": 0.029201552271842957, "learning_rate": 1e-06, "loss": 0.0522, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13314.0, "completions/mean_length": 1935.361328125, "completions/mean_terminated_length": 1647.5399169921875, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.23343181610107422, "epoch": 0.46578947368421053, "frac_reward_zero_std": 0.3125, "grad_norm": 0.036340776830911636, "learning_rate": 1e-06, "loss": 0.0636, "num_tokens": 69200088.0, "reward": 0.8155167102813721, "reward_std": 0.1823035329580307, "rewards/progression_diversity/mean": -0.0010712584480643272, "rewards/progression_diversity/std": 0.016564983874559402, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.9358723759651184, "rewards/symbolic_reward_partial_score/std": 0.22121170163154602, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0475256443023682, "sampling/importance_sampling_ratio/min": 1.0533493565237362e-12, "sampling/sampling_logp_difference/max": 27.57904624938965, "sampling/sampling_logp_difference/mean": 0.09736086428165436, "step": 177 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.22760003805160522, "epoch": 0.46842105263157896, "grad_norm": 0.034887488931417465, "learning_rate": 1e-06, "loss": 0.0751, "step": 178 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22835474461317062, "epoch": 0.4710526315789474, "grad_norm": 0.02664666809141636, "learning_rate": 1e-06, "loss": 0.0702, "step": 179 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.23141715675592422, "epoch": 0.47368421052631576, "grad_norm": 0.02732260897755623, "learning_rate": 1e-06, "loss": 0.0256, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13435.0, "completions/mean_length": 2063.080078125, "completions/mean_terminated_length": 1748.648681640625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.22150472551584244, "epoch": 0.4763157894736842, "frac_reward_zero_std": 0.375, "grad_norm": 0.048945918679237366, "learning_rate": 1e-06, "loss": 0.0967, "num_tokens": 70669697.0, "reward": 0.7893031239509583, "reward_std": 0.20550626516342163, "rewards/progression_diversity/mean": -0.00035783840576186776, "rewards/progression_diversity/std": 0.008096958510577679, "rewards/symbolic_reward_accuracy/mean": 0.8671875, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.90380859375, "rewards/symbolic_reward_partial_score/std": 0.27613478899002075, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0472935438156128, "sampling/importance_sampling_ratio/min": 0.004802103620022535, "sampling/sampling_logp_difference/max": 5.338701248168945, "sampling/sampling_logp_difference/mean": 0.09636872261762619, "step": 181 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.22790049761533737, "epoch": 0.4789473684210526, "grad_norm": 0.021670255810022354, "learning_rate": 1e-06, "loss": 0.0223, "step": 182 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22305145859718323, "epoch": 0.48157894736842105, "grad_norm": 0.012241641990840435, "learning_rate": 1e-06, "loss": 0.0197, "step": 183 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.21923468261957169, "epoch": 0.4842105263157895, "grad_norm": 0.029154837131500244, "learning_rate": 1e-06, "loss": 0.074, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 10743.0, "completions/mean_length": 1856.03515625, "completions/mean_terminated_length": 1507.364013671875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.2165418118238449, "epoch": 0.4868421052631579, "frac_reward_zero_std": 0.3125, "grad_norm": 0.04361012578010559, "learning_rate": 1e-06, "loss": 0.1, "num_tokens": 72034643.0, "reward": 0.8248018026351929, "reward_std": 0.15697109699249268, "rewards/progression_diversity/mean": -0.0002908821334131062, "rewards/progression_diversity/std": 0.00464981934055686, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9518228769302368, "rewards/symbolic_reward_partial_score/std": 0.18324565887451172, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0449192523956299, "sampling/importance_sampling_ratio/min": 0.000814249215181917, "sampling/sampling_logp_difference/max": 7.11324405670166, "sampling/sampling_logp_difference/mean": 0.09324932098388672, "step": 185 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.22823286801576614, "epoch": 0.48947368421052634, "grad_norm": 0.015330553986132145, "learning_rate": 1e-06, "loss": 0.0087, "step": 186 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.22463513165712357, "epoch": 0.4921052631578947, "grad_norm": 0.028646033257246017, "learning_rate": 1e-06, "loss": 0.0249, "step": 187 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.21679264307022095, "epoch": 0.49473684210526314, "grad_norm": 0.027475610375404358, "learning_rate": 1e-06, "loss": 0.1208, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14325.0, "completions/mean_length": 1838.8671875, "completions/mean_terminated_length": 1489.7840576171875, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "entropy": 0.2217376008629799, "epoch": 0.49736842105263157, "frac_reward_zero_std": 0.375, "grad_norm": 0.03827635198831558, "learning_rate": 1e-06, "loss": 0.0414, "num_tokens": 73378767.0, "reward": 0.8262168169021606, "reward_std": 0.1503533273935318, "rewards/progression_diversity/mean": -0.00038728650542907417, "rewards/progression_diversity/std": 0.0051419991068542, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.94091796875, "rewards/symbolic_reward_partial_score/std": 0.215196430683136, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0445411205291748, "sampling/importance_sampling_ratio/min": 8.583670023654122e-07, "sampling/sampling_logp_difference/max": 13.968234062194824, "sampling/sampling_logp_difference/mean": 0.09397557377815247, "step": 189 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.21815990656614304, "epoch": 0.5, "grad_norm": 0.027534402906894684, "learning_rate": 1e-06, "loss": 0.0625, "step": 190 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.21029751747846603, "epoch": 0.5026315789473684, "grad_norm": 0.028749065473675728, "learning_rate": 1e-06, "loss": 0.0977, "step": 191 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.21950938552618027, "epoch": 0.5052631578947369, "grad_norm": 0.02661977894604206, "learning_rate": 1e-06, "loss": 0.0369, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12097.0, "completions/mean_length": 2130.484375, "completions/mean_terminated_length": 1817.532958984375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "entropy": 0.2030806839466095, "epoch": 0.5078947368421053, "frac_reward_zero_std": 0.3125, "grad_norm": 0.025007417425513268, "learning_rate": 1e-06, "loss": 0.0233, "num_tokens": 74868871.0, "reward": 0.8149902820587158, "reward_std": 0.18850557506084442, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.92236328125, "rewards/symbolic_reward_partial_score/std": 0.2531226873397827, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0420916080474854, "sampling/importance_sampling_ratio/min": 5.281509857013589e-06, "sampling/sampling_logp_difference/max": 12.151298522949219, "sampling/sampling_logp_difference/mean": 0.08910438418388367, "step": 193 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.20039298385381699, "epoch": 0.5105263157894737, "grad_norm": 0.019791144877672195, "learning_rate": 1e-06, "loss": 0.0733, "step": 194 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.20205791294574738, "epoch": 0.5131578947368421, "grad_norm": 0.03766762465238571, "learning_rate": 1e-06, "loss": 0.054, "step": 195 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.19714680314064026, "epoch": 0.5157894736842106, "grad_norm": 0.04690009355545044, "learning_rate": 1e-06, "loss": 0.1071, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11150.0, "completions/mean_length": 1767.6796875, "completions/mean_terminated_length": 1386.893798828125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.19719091057777405, "epoch": 0.5184210526315789, "frac_reward_zero_std": 0.34375, "grad_norm": 0.031063755974173546, "learning_rate": 1e-06, "loss": 0.0426, "num_tokens": 76177731.0, "reward": 0.823818027973175, "reward_std": 0.16009867191314697, "rewards/progression_diversity/mean": -0.0010085422545671463, "rewards/progression_diversity/std": 0.010762260295450687, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.9459635019302368, "rewards/symbolic_reward_partial_score/std": 0.19425591826438904, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0391511917114258, "sampling/importance_sampling_ratio/min": 0.0001364499912597239, "sampling/sampling_logp_difference/max": 8.899552345275879, "sampling/sampling_logp_difference/mean": 0.08513116836547852, "step": 197 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.19633030891418457, "epoch": 0.5210526315789473, "grad_norm": 0.018081026151776314, "learning_rate": 1e-06, "loss": 0.0274, "step": 198 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.19438430666923523, "epoch": 0.5236842105263158, "grad_norm": 0.03003780171275139, "learning_rate": 1e-06, "loss": 0.0837, "step": 199 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.1886511892080307, "epoch": 0.5263157894736842, "grad_norm": 0.032905541360378265, "learning_rate": 1e-06, "loss": 0.0643, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16023.0, "completions/mean_length": 2301.400390625, "completions/mean_terminated_length": 1728.9368896484375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.18248793482780457, "epoch": 0.5289473684210526, "frac_reward_zero_std": 0.1875, "grad_norm": 0.04558183625340462, "learning_rate": 1e-06, "loss": 0.1227, "num_tokens": 77773552.0, "reward": 0.8007115721702576, "reward_std": 0.21243008971214294, "rewards/progression_diversity/mean": -0.0020881860982626677, "rewards/progression_diversity/std": 0.021307511255145073, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.9197590947151184, "rewards/symbolic_reward_partial_score/std": 0.2490549385547638, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0355485677719116, "sampling/importance_sampling_ratio/min": 1.0011056567060805e-10, "sampling/sampling_logp_difference/max": 23.02474594116211, "sampling/sampling_logp_difference/mean": 0.07838210463523865, "step": 201 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.17451690137386322, "epoch": 0.531578947368421, "grad_norm": 0.04084772244095802, "learning_rate": 1e-06, "loss": 0.1141, "step": 202 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.17165183275938034, "epoch": 0.5342105263157895, "grad_norm": 0.031080789864063263, "learning_rate": 1e-06, "loss": 0.0879, "step": 203 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.16927115619182587, "epoch": 0.5368421052631579, "grad_norm": 0.016663571819663048, "learning_rate": 1e-06, "loss": 0.0779, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16068.0, "completions/mean_length": 2071.619140625, "completions/mean_terminated_length": 1550.115478515625, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.16112020611763, "epoch": 0.5394736842105263, "frac_reward_zero_std": 0.25, "grad_norm": 0.018799329176545143, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 79247213.0, "reward": 0.8091070055961609, "reward_std": 0.1729590892791748, "rewards/progression_diversity/mean": -0.002385494764894247, "rewards/progression_diversity/std": 0.028513550758361816, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9347330331802368, "rewards/symbolic_reward_partial_score/std": 0.21787308156490326, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0297162532806396, "sampling/importance_sampling_ratio/min": 1.7676053403192782e-06, "sampling/sampling_logp_difference/max": 13.245884895324707, "sampling/sampling_logp_difference/mean": 0.0692635178565979, "step": 205 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.15220321714878082, "epoch": 0.5421052631578948, "grad_norm": 0.037522438913583755, "learning_rate": 1e-06, "loss": 0.1625, "step": 206 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.15934187173843384, "epoch": 0.5447368421052632, "grad_norm": 0.013017321936786175, "learning_rate": 1e-06, "loss": 0.0563, "step": 207 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.15468831360340118, "epoch": 0.5473684210526316, "grad_norm": 0.009292000904679298, "learning_rate": 1e-06, "loss": 0.0932, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 9757.0, "completions/mean_length": 1568.158203125, "completions/mean_terminated_length": 1212.5780029296875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.15928836166858673, "epoch": 0.55, "frac_reward_zero_std": 0.4375, "grad_norm": 0.027132874354720116, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 80432382.0, "reward": 0.8553647994995117, "reward_std": 0.11348331719636917, "rewards/progression_diversity/mean": -0.0006339521496556699, "rewards/progression_diversity/std": 0.00838653463870287, "rewards/symbolic_reward_accuracy/mean": 0.9453125, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.9690755009651184, "rewards/symbolic_reward_partial_score/std": 0.1494486927986145, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0306223630905151, "sampling/importance_sampling_ratio/min": 7.626142905792221e-05, "sampling/sampling_logp_difference/max": 9.481343269348145, "sampling/sampling_logp_difference/mean": 0.07124973833560944, "step": 209 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.15551680326461792, "epoch": 0.5526315789473685, "grad_norm": 0.018515659496188164, "learning_rate": 1e-06, "loss": 0.0522, "step": 210 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.15634237229824066, "epoch": 0.5552631578947368, "grad_norm": 0.012999890372157097, "learning_rate": 1e-06, "loss": 0.0665, "step": 211 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.15420294553041458, "epoch": 0.5578947368421052, "grad_norm": 0.021096454933285713, "learning_rate": 1e-06, "loss": 0.131, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15015.0, "completions/mean_length": 2094.546875, "completions/mean_terminated_length": 1422.44580078125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "entropy": 0.1454745978116989, "epoch": 0.5605263157894737, "frac_reward_zero_std": 0.34375, "grad_norm": 0.021531764417886734, "learning_rate": 1e-06, "loss": 0.0801, "num_tokens": 81900598.0, "reward": 0.7925652265548706, "reward_std": 0.19417259097099304, "rewards/progression_diversity/mean": -0.0012926449999213219, "rewards/progression_diversity/std": 0.012499667704105377, "rewards/symbolic_reward_accuracy/mean": 0.873046875, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.91015625, "rewards/symbolic_reward_partial_score/std": 0.2634425461292267, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0279793739318848, "sampling/importance_sampling_ratio/min": 3.422269249670623e-13, "sampling/sampling_logp_difference/max": 28.70330238342285, "sampling/sampling_logp_difference/mean": 0.06678377091884613, "step": 213 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.14702380448579788, "epoch": 0.5631578947368421, "grad_norm": 0.020642530173063278, "learning_rate": 1e-06, "loss": 0.0657, "step": 214 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.14872030168771744, "epoch": 0.5657894736842105, "grad_norm": 0.016585860401391983, "learning_rate": 1e-06, "loss": -0.0113, "step": 215 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.14099013805389404, "epoch": 0.5684210526315789, "grad_norm": 0.03327646851539612, "learning_rate": 1e-06, "loss": 0.1437, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12443.0, "completions/mean_length": 2076.515625, "completions/mean_terminated_length": 1342.045166015625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.15217551589012146, "epoch": 0.5710526315789474, "frac_reward_zero_std": 0.21875, "grad_norm": 0.030961882323026657, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 83365918.0, "reward": 0.8457842469215393, "reward_std": 0.14305046200752258, "rewards/progression_diversity/mean": -0.0016604659613221884, "rewards/progression_diversity/std": 0.020232077687978745, "rewards/symbolic_reward_accuracy/mean": 0.935546875, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.9638671875, "rewards/symbolic_reward_partial_score/std": 0.15897713601589203, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.025794267654419, "sampling/importance_sampling_ratio/min": 1.3557089005189482e-05, "sampling/sampling_logp_difference/max": 11.208600997924805, "sampling/sampling_logp_difference/mean": 0.06135455146431923, "step": 217 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.13979589194059372, "epoch": 0.5736842105263158, "grad_norm": 0.030766695737838745, "learning_rate": 1e-06, "loss": 0.136, "step": 218 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.1349509060382843, "epoch": 0.5763157894736842, "grad_norm": 0.024950722232460976, "learning_rate": 1e-06, "loss": 0.1487, "step": 219 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.13936427235603333, "epoch": 0.5789473684210527, "grad_norm": 0.025016427040100098, "learning_rate": 1e-06, "loss": 0.1389, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 9998.0, "completions/mean_length": 2004.419921875, "completions/mean_terminated_length": 1450.2373046875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.13566848635673523, "epoch": 0.5815789473684211, "frac_reward_zero_std": 0.21875, "grad_norm": 0.03230908513069153, "learning_rate": 1e-06, "loss": 0.0997, "num_tokens": 84804853.0, "reward": 0.8050163984298706, "reward_std": 0.20277492702007294, "rewards/progression_diversity/mean": -0.001296035130508244, "rewards/progression_diversity/std": 0.020735615864396095, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9256185293197632, "rewards/symbolic_reward_partial_score/std": 0.23007379472255707, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0255374908447266, "sampling/importance_sampling_ratio/min": 0.00021752847533207387, "sampling/sampling_logp_difference/max": 8.433180809020996, "sampling/sampling_logp_difference/mean": 0.06265204399824142, "step": 221 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.13476034253835678, "epoch": 0.5842105263157895, "grad_norm": 0.038641829043626785, "learning_rate": 1e-06, "loss": 0.0417, "step": 222 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.13380540907382965, "epoch": 0.5868421052631579, "grad_norm": 0.02986939251422882, "learning_rate": 1e-06, "loss": 0.0707, "step": 223 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.13185346126556396, "epoch": 0.5894736842105263, "grad_norm": 0.04221319779753685, "learning_rate": 1e-06, "loss": 0.0909, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 11142.0, "completions/mean_length": 1651.677734375, "completions/mean_terminated_length": 1207.0401611328125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.13264429569244385, "epoch": 0.5921052631578947, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03359711542725563, "learning_rate": 1e-06, "loss": 0.0813, "num_tokens": 86012272.0, "reward": 0.8497519493103027, "reward_std": 0.12195061147212982, "rewards/progression_diversity/mean": -0.0003971385594923049, "rewards/progression_diversity/std": 0.007108698599040508, "rewards/symbolic_reward_accuracy/mean": 0.943359375, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.95556640625, "rewards/symbolic_reward_partial_score/std": 0.19622980058193207, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.024213194847107, "sampling/importance_sampling_ratio/min": 1.59300361701753e-05, "sampling/sampling_logp_difference/max": 11.047304153442383, "sampling/sampling_logp_difference/mean": 0.06262955069541931, "step": 225 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.13049791753292084, "epoch": 0.5947368421052631, "grad_norm": 0.013412282802164555, "learning_rate": 1e-06, "loss": 0.0659, "step": 226 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.12776117026805878, "epoch": 0.5973684210526315, "grad_norm": 0.023844925686717033, "learning_rate": 1e-06, "loss": 0.044, "step": 227 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.12381385266780853, "epoch": 0.6, "grad_norm": 0.028511611744761467, "learning_rate": 1e-06, "loss": 0.0867, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15721.0, "completions/mean_length": 1463.47265625, "completions/mean_terminated_length": 1196.5048828125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.1172075942158699, "epoch": 0.6026315789473684, "frac_reward_zero_std": 0.5, "grad_norm": 0.04271606728434563, "learning_rate": 1e-06, "loss": 0.0551, "num_tokens": 87146690.0, "reward": 0.8489208221435547, "reward_std": 0.12613338232040405, "rewards/progression_diversity/mean": -0.0005037329392507672, "rewards/progression_diversity/std": 0.008694959804415703, "rewards/symbolic_reward_accuracy/mean": 0.9375, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.9606119394302368, "rewards/symbolic_reward_partial_score/std": 0.17724183201789856, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0227758884429932, "sampling/importance_sampling_ratio/min": 8.189291838789359e-05, "sampling/sampling_logp_difference/max": 9.4100980758667, "sampling/sampling_logp_difference/mean": 0.05999467894434929, "step": 229 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.11710315942764282, "epoch": 0.6052631578947368, "grad_norm": 0.008632275275886059, "learning_rate": 1e-06, "loss": 0.0071, "step": 230 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.11399340257048607, "epoch": 0.6078947368421053, "grad_norm": 0.012347784824669361, "learning_rate": 1e-06, "loss": 0.0357, "step": 231 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.11190313473343849, "epoch": 0.6105263157894737, "grad_norm": 0.022991470992565155, "learning_rate": 1e-06, "loss": 0.0425, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14312.0, "completions/mean_length": 1782.529296875, "completions/mean_terminated_length": 1311.5140380859375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.10688180476427078, "epoch": 0.6131578947368421, "frac_reward_zero_std": 0.40625, "grad_norm": 0.03337698429822922, "learning_rate": 1e-06, "loss": 0.0355, "num_tokens": 88459825.0, "reward": 0.7974966168403625, "reward_std": 0.17888271808624268, "rewards/progression_diversity/mean": -0.00131894217338413, "rewards/progression_diversity/std": 0.017751460894942284, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.9187825322151184, "rewards/symbolic_reward_partial_score/std": 0.24961021542549133, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0183002948760986, "sampling/importance_sampling_ratio/min": 5.948247780906968e-05, "sampling/sampling_logp_difference/max": 9.729828834533691, "sampling/sampling_logp_difference/mean": 0.05178453028202057, "step": 233 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.10239839181303978, "epoch": 0.6157894736842106, "grad_norm": 0.019929470494389534, "learning_rate": 1e-06, "loss": 0.0941, "step": 234 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.09998245164752007, "epoch": 0.618421052631579, "grad_norm": 0.027180753648281097, "learning_rate": 1e-06, "loss": 0.0925, "step": 235 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.10064789652824402, "epoch": 0.6210526315789474, "grad_norm": 0.032762862741947174, "learning_rate": 1e-06, "loss": 0.0622, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15458.0, "completions/mean_length": 1760.564453125, "completions/mean_terminated_length": 1227.726806640625, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.09621577709913254, "epoch": 0.6236842105263158, "frac_reward_zero_std": 0.375, "grad_norm": 0.03691167011857033, "learning_rate": 1e-06, "loss": 0.1223, "num_tokens": 89768210.0, "reward": 0.8165925741195679, "reward_std": 0.1556319147348404, "rewards/progression_diversity/mean": -0.0008989507332444191, "rewards/progression_diversity/std": 0.016222195699810982, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9283853769302368, "rewards/symbolic_reward_partial_score/std": 0.23884880542755127, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0182127952575684, "sampling/importance_sampling_ratio/min": 0.0003808625042438507, "sampling/sampling_logp_difference/max": 7.873072147369385, "sampling/sampling_logp_difference/mean": 0.052828144282102585, "step": 237 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.10177255794405937, "epoch": 0.6263157894736842, "grad_norm": 0.01514112763106823, "learning_rate": 1e-06, "loss": 0.0734, "step": 238 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.10082132369279861, "epoch": 0.6289473684210526, "grad_norm": 0.02351570315659046, "learning_rate": 1e-06, "loss": 0.0691, "step": 239 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.10559947416186333, "epoch": 0.631578947368421, "grad_norm": 0.012419568374752998, "learning_rate": 1e-06, "loss": 0.0251, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8674.0, "completions/mean_length": 1312.580078125, "completions/mean_terminated_length": 1073.3511962890625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.10397671908140182, "epoch": 0.6342105263157894, "frac_reward_zero_std": 0.5, "grad_norm": 0.03270323947072029, "learning_rate": 1e-06, "loss": 0.036, "num_tokens": 90821019.0, "reward": 0.8449134826660156, "reward_std": 0.10425379872322083, "rewards/progression_diversity/mean": -0.0008408930152654648, "rewards/progression_diversity/std": 0.01656174287199974, "rewards/symbolic_reward_accuracy/mean": 0.9296875, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.9615885615348816, "rewards/symbolic_reward_partial_score/std": 0.16416993737220764, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0181472301483154, "sampling/importance_sampling_ratio/min": 2.46100570477914e-11, "sampling/sampling_logp_difference/max": 24.427865982055664, "sampling/sampling_logp_difference/mean": 0.055217523127794266, "step": 241 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.10521255061030388, "epoch": 0.6368421052631579, "grad_norm": 0.021176166832447052, "learning_rate": 1e-06, "loss": 0.0615, "step": 242 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.10355697944760323, "epoch": 0.6394736842105263, "grad_norm": 0.015480482950806618, "learning_rate": 1e-06, "loss": 0.0404, "step": 243 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.10299444571137428, "epoch": 0.6421052631578947, "grad_norm": 0.046614497900009155, "learning_rate": 1e-06, "loss": 0.0686, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14336.0, "completions/mean_length": 1815.578125, "completions/mean_terminated_length": 1254.11767578125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.0950668603181839, "epoch": 0.6447368421052632, "frac_reward_zero_std": 0.375, "grad_norm": 0.03163347393274307, "learning_rate": 1e-06, "loss": 0.0472, "num_tokens": 92168259.0, "reward": 0.7833421230316162, "reward_std": 0.1979527473449707, "rewards/progression_diversity/mean": -0.0007511397707276046, "rewards/progression_diversity/std": 0.010229557752609253, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.91259765625, "rewards/symbolic_reward_partial_score/std": 0.2514844834804535, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0157463550567627, "sampling/importance_sampling_ratio/min": 8.007385726704896e-11, "sampling/sampling_logp_difference/max": 23.248071670532227, "sampling/sampling_logp_difference/mean": 0.04827887937426567, "step": 245 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.09082784876227379, "epoch": 0.6473684210526316, "grad_norm": 0.03819683939218521, "learning_rate": 1e-06, "loss": 0.0754, "step": 246 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.09024357795715332, "epoch": 0.65, "grad_norm": 0.03529435768723488, "learning_rate": 1e-06, "loss": 0.1095, "step": 247 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.09114789962768555, "epoch": 0.6526315789473685, "grad_norm": 0.014682702720165253, "learning_rate": 1e-06, "loss": 0.0538, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11281.0, "completions/mean_length": 1623.357421875, "completions/mean_terminated_length": 1147.2076416015625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "entropy": 0.08885756507515907, "epoch": 0.6552631578947369, "frac_reward_zero_std": 0.375, "grad_norm": 0.04838551953434944, "learning_rate": 1e-06, "loss": 0.1366, "num_tokens": 93418746.0, "reward": 0.826103687286377, "reward_std": 0.17412379384040833, "rewards/progression_diversity/mean": -0.0019385741325095296, "rewards/progression_diversity/std": 0.022392842918634415, "rewards/symbolic_reward_accuracy/mean": 0.912109375, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.93994140625, "rewards/symbolic_reward_partial_score/std": 0.2181263417005539, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0159519910812378, "sampling/importance_sampling_ratio/min": 4.792552772414638e-06, "sampling/sampling_logp_difference/max": 12.24844741821289, "sampling/sampling_logp_difference/mean": 0.049355216324329376, "step": 249 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.09172747284173965, "epoch": 0.6578947368421053, "grad_norm": 0.020697645843029022, "learning_rate": 1e-06, "loss": 0.0594, "step": 250 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.09275993704795837, "epoch": 0.6605263157894737, "grad_norm": 0.02590845711529255, "learning_rate": 1e-06, "loss": 0.0501, "step": 251 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.09400615096092224, "epoch": 0.6631578947368421, "grad_norm": 0.03462434932589531, "learning_rate": 1e-06, "loss": 0.0493, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8934.0, "completions/mean_length": 1809.392578125, "completions/mean_terminated_length": 1278.333984375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "entropy": 0.09200675785541534, "epoch": 0.6657894736842105, "frac_reward_zero_std": 0.5, "grad_norm": 0.04529382660984993, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 94753315.0, "reward": 0.8259365558624268, "reward_std": 0.13518205285072327, "rewards/progression_diversity/mean": -0.004001125227659941, "rewards/progression_diversity/std": 0.04852549359202385, "rewards/symbolic_reward_accuracy/mean": 0.916015625, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.9329427480697632, "rewards/symbolic_reward_partial_score/std": 0.23484937846660614, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0148303508758545, "sampling/importance_sampling_ratio/min": 1.1414035583356963e-07, "sampling/sampling_logp_difference/max": 15.98583698272705, "sampling/sampling_logp_difference/mean": 0.04799918830394745, "step": 253 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.08636122196912766, "epoch": 0.6684210526315789, "grad_norm": 0.039753351360559464, "learning_rate": 1e-06, "loss": 0.1328, "step": 254 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.09036806598305702, "epoch": 0.6710526315789473, "grad_norm": 0.02018708921968937, "learning_rate": 1e-06, "loss": 0.0184, "step": 255 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.08820921182632446, "epoch": 0.6736842105263158, "grad_norm": 0.03188847377896309, "learning_rate": 1e-06, "loss": 0.0965, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13585.0, "completions/mean_length": 1527.927734375, "completions/mean_terminated_length": 1171.382080078125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.09194277971982956, "epoch": 0.6763157894736842, "frac_reward_zero_std": 0.4375, "grad_norm": 0.04443329945206642, "learning_rate": 1e-06, "loss": 0.064, "num_tokens": 95937918.0, "reward": 0.8315865397453308, "reward_std": 0.13644886016845703, "rewards/progression_diversity/mean": -0.0005308896070346236, "rewards/progression_diversity/std": 0.0070388540625572205, "rewards/symbolic_reward_accuracy/mean": 0.9140625, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.95166015625, "rewards/symbolic_reward_partial_score/std": 0.1864015907049179, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.015270471572876, "sampling/importance_sampling_ratio/min": 1.5392264726326823e-19, "sampling/sampling_logp_difference/max": 43.31783676147461, "sampling/sampling_logp_difference/mean": 0.04934335872530937, "step": 257 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.08927204459905624, "epoch": 0.6789473684210526, "grad_norm": 0.030259989202022552, "learning_rate": 1e-06, "loss": 0.0617, "step": 258 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.0892256572842598, "epoch": 0.6815789473684211, "grad_norm": 0.03653491288423538, "learning_rate": 1e-06, "loss": 0.0888, "step": 259 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.08825241401791573, "epoch": 0.6842105263157895, "grad_norm": 0.02339651621878147, "learning_rate": 1e-06, "loss": 0.046, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10342.0, "completions/mean_length": 1722.841796875, "completions/mean_terminated_length": 1188.629638671875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 0.09076549112796783, "epoch": 0.6868421052631579, "frac_reward_zero_std": 0.4375, "grad_norm": 0.021180791780352592, "learning_rate": 1e-06, "loss": 0.0357, "num_tokens": 97201805.0, "reward": 0.8398886919021606, "reward_std": 0.13938692212104797, "rewards/progression_diversity/mean": -0.00038591912016272545, "rewards/progression_diversity/std": 0.005922461394220591, "rewards/symbolic_reward_accuracy/mean": 0.9296875, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.9519857168197632, "rewards/symbolic_reward_partial_score/std": 0.19745849072933197, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0142853260040283, "sampling/importance_sampling_ratio/min": 2.9707528881317558e-08, "sampling/sampling_logp_difference/max": 17.331865310668945, "sampling/sampling_logp_difference/mean": 0.04651745408773422, "step": 261 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.09219764545559883, "epoch": 0.6894736842105263, "grad_norm": 0.03137506544589996, "learning_rate": 1e-06, "loss": 0.0417, "step": 262 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.09567607566714287, "epoch": 0.6921052631578948, "grad_norm": 0.022778861224651337, "learning_rate": 1e-06, "loss": 0.0223, "step": 263 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.0844137892127037, "epoch": 0.6947368421052632, "grad_norm": 0.03902408108115196, "learning_rate": 1e-06, "loss": 0.1838, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 11656.0, "completions/mean_length": 1918.83203125, "completions/mean_terminated_length": 1269.37548828125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.09079957380890846, "epoch": 0.6973684210526315, "frac_reward_zero_std": 0.3125, "grad_norm": 0.03956228867173195, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 98594391.0, "reward": 0.7959408164024353, "reward_std": 0.18128597736358643, "rewards/progression_diversity/mean": -0.0006502005271613598, "rewards/progression_diversity/std": 0.010507218539714813, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.91748046875, "rewards/symbolic_reward_partial_score/std": 0.25184011459350586, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0138227939605713, "sampling/importance_sampling_ratio/min": 2.1744256395450634e-10, "sampling/sampling_logp_difference/max": 22.249086380004883, "sampling/sampling_logp_difference/mean": 0.04669389873743057, "step": 265 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.0856584906578064, "epoch": 0.7, "grad_norm": 0.04341353103518486, "learning_rate": 1e-06, "loss": 0.0789, "step": 266 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.08574041724205017, "epoch": 0.7026315789473684, "grad_norm": 0.04117586463689804, "learning_rate": 1e-06, "loss": 0.1279, "step": 267 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.08641528338193893, "epoch": 0.7052631578947368, "grad_norm": 0.023934362456202507, "learning_rate": 1e-06, "loss": 0.0486, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 6950.0, "completions/mean_length": 1519.431640625, "completions/mean_terminated_length": 1039.929443359375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.09421858936548233, "epoch": 0.7078947368421052, "frac_reward_zero_std": 0.4375, "grad_norm": 0.021700512617826462, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 99741620.0, "reward": 0.8472155332565308, "reward_std": 0.12852367758750916, "rewards/progression_diversity/mean": -0.0001352034305455163, "rewards/progression_diversity/std": 0.0023650997318327427, "rewards/symbolic_reward_accuracy/mean": 0.939453125, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.95556640625, "rewards/symbolic_reward_partial_score/std": 0.19245369732379913, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.014095664024353, "sampling/importance_sampling_ratio/min": 0.0012474657269194722, "sampling/sampling_logp_difference/max": 6.686641216278076, "sampling/sampling_logp_difference/mean": 0.047991082072257996, "step": 269 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.08904735371470451, "epoch": 0.7105263157894737, "grad_norm": 0.020569216459989548, "learning_rate": 1e-06, "loss": 0.0653, "step": 270 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.08358699828386307, "epoch": 0.7131578947368421, "grad_norm": 0.036929428577423096, "learning_rate": 1e-06, "loss": 0.164, "step": 271 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.08975838124752045, "epoch": 0.7157894736842105, "grad_norm": 0.01153595745563507, "learning_rate": 1e-06, "loss": 0.0975, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11530.0, "completions/mean_length": 1798.42578125, "completions/mean_terminated_length": 1205.5162353515625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.08779791370034218, "epoch": 0.718421052631579, "frac_reward_zero_std": 0.40625, "grad_norm": 0.051671918481588364, "learning_rate": 1e-06, "loss": 0.0716, "num_tokens": 101081902.0, "reward": 0.8015538454055786, "reward_std": 0.1920056939125061, "rewards/progression_diversity/mean": -0.0008653616532683372, "rewards/progression_diversity/std": 0.011233367957174778, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9192708134651184, "rewards/symbolic_reward_partial_score/std": 0.24985045194625854, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.012866497039795, "sampling/importance_sampling_ratio/min": 5.919900445405801e-07, "sampling/sampling_logp_difference/max": 14.339776039123535, "sampling/sampling_logp_difference/mean": 0.047424331307411194, "step": 273 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.08645644038915634, "epoch": 0.7210526315789474, "grad_norm": 0.04649584740400314, "learning_rate": 1e-06, "loss": 0.0667, "step": 274 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.08046365156769753, "epoch": 0.7236842105263158, "grad_norm": 0.044324759393930435, "learning_rate": 1e-06, "loss": 0.1699, "step": 275 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.08621568232774734, "epoch": 0.7263157894736842, "grad_norm": 0.033357467502355576, "learning_rate": 1e-06, "loss": 0.0483, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11624.0, "completions/mean_length": 1809.94140625, "completions/mean_terminated_length": 1309.418212890625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 0.08192634955048561, "epoch": 0.7289473684210527, "frac_reward_zero_std": 0.28125, "grad_norm": 0.04480903223156929, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 102443600.0, "reward": 0.7796283960342407, "reward_std": 0.20830506086349487, "rewards/progression_diversity/mean": -0.001035432331264019, "rewards/progression_diversity/std": 0.010857795365154743, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.9021809697151184, "rewards/symbolic_reward_partial_score/std": 0.27368080615997314, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0123987197875977, "sampling/importance_sampling_ratio/min": 1.9622171695829784e-17, "sampling/sampling_logp_difference/max": 38.469871520996094, "sampling/sampling_logp_difference/mean": 0.04502936080098152, "step": 277 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.08124225959181786, "epoch": 0.7315789473684211, "grad_norm": 0.03922456502914429, "learning_rate": 1e-06, "loss": 0.0856, "step": 278 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.08088827505707741, "epoch": 0.7342105263157894, "grad_norm": 0.029460720717906952, "learning_rate": 1e-06, "loss": 0.0527, "step": 279 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.07565705850720406, "epoch": 0.7368421052631579, "grad_norm": 0.03428329899907112, "learning_rate": 1e-06, "loss": 0.1428, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6622.0, "completions/mean_length": 1865.5, "completions/mean_terminated_length": 1213.64892578125, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.08282288908958435, "epoch": 0.7394736842105263, "frac_reward_zero_std": 0.375, "grad_norm": 0.03280191496014595, "learning_rate": 1e-06, "loss": 0.048, "num_tokens": 103822896.0, "reward": 0.799896240234375, "reward_std": 0.18378372490406036, "rewards/progression_diversity/mean": -0.0006127399392426014, "rewards/progression_diversity/std": 0.0077356030233204365, "rewards/symbolic_reward_accuracy/mean": 0.876953125, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.9267578125, "rewards/symbolic_reward_partial_score/std": 0.23403851687908173, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0123448371887207, "sampling/importance_sampling_ratio/min": 1.5536484170297626e-06, "sampling/sampling_logp_difference/max": 13.37490463256836, "sampling/sampling_logp_difference/mean": 0.047334231436252594, "step": 281 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.08162574470043182, "epoch": 0.7421052631578947, "grad_norm": 0.03995591774582863, "learning_rate": 1e-06, "loss": 0.1125, "step": 282 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.08600019663572311, "epoch": 0.7447368421052631, "grad_norm": 0.02127450704574585, "learning_rate": 1e-06, "loss": 0.0228, "step": 283 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.08134080097079277, "epoch": 0.7473684210526316, "grad_norm": 0.020339855924248695, "learning_rate": 1e-06, "loss": 0.1129, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16203.0, "completions/mean_length": 2301.19921875, "completions/mean_terminated_length": 1362.345947265625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "entropy": 0.07727199792861938, "epoch": 0.75, "frac_reward_zero_std": 0.1875, "grad_norm": 0.059844452887773514, "learning_rate": 1e-06, "loss": 0.1395, "num_tokens": 105415446.0, "reward": 0.7762081623077393, "reward_std": 0.252108633518219, "rewards/progression_diversity/mean": -0.001261794357560575, "rewards/progression_diversity/std": 0.012555805034935474, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.8855794072151184, "rewards/symbolic_reward_partial_score/std": 0.30380791425704956, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0110630989074707, "sampling/importance_sampling_ratio/min": 1.2527775652415585e-05, "sampling/sampling_logp_difference/max": 11.287562370300293, "sampling/sampling_logp_difference/mean": 0.04290057718753815, "step": 285 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.07825002819299698, "epoch": 0.7526315789473684, "grad_norm": 0.03701354190707207, "learning_rate": 1e-06, "loss": 0.0698, "step": 286 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.07666192576289177, "epoch": 0.7552631578947369, "grad_norm": 0.03503553569316864, "learning_rate": 1e-06, "loss": 0.1734, "step": 287 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.07939735427498817, "epoch": 0.7578947368421053, "grad_norm": 0.03646966814994812, "learning_rate": 1e-06, "loss": 0.0965, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 12705.0, "completions/mean_length": 2036.3125, "completions/mean_terminated_length": 1392.1304931640625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.07943989709019661, "epoch": 0.7605263157894737, "frac_reward_zero_std": 0.21875, "grad_norm": 0.054009877145290375, "learning_rate": 1e-06, "loss": 0.0678, "num_tokens": 106871190.0, "reward": 0.76430344581604, "reward_std": 0.23938891291618347, "rewards/progression_diversity/mean": -0.0003246946434956044, "rewards/progression_diversity/std": 0.005887071136385202, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.88232421875, "rewards/symbolic_reward_partial_score/std": 0.3022898733615875, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0122061967849731, "sampling/importance_sampling_ratio/min": 4.789482318301452e-06, "sampling/sampling_logp_difference/max": 12.249088287353516, "sampling/sampling_logp_difference/mean": 0.046712085604667664, "step": 289 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.08157060295343399, "epoch": 0.7631578947368421, "grad_norm": 0.04651428759098053, "learning_rate": 1e-06, "loss": 0.0802, "step": 290 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.07789277285337448, "epoch": 0.7657894736842106, "grad_norm": 0.026174401864409447, "learning_rate": 1e-06, "loss": 0.1247, "step": 291 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.08365096524357796, "epoch": 0.7684210526315789, "grad_norm": 0.03342805802822113, "learning_rate": 1e-06, "loss": 0.0482, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12578.0, "completions/mean_length": 1690.671875, "completions/mean_terminated_length": 1368.0638427734375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.08383786678314209, "epoch": 0.7710526315789473, "frac_reward_zero_std": 0.40625, "grad_norm": 0.051632702350616455, "learning_rate": 1e-06, "loss": 0.1011, "num_tokens": 108138638.0, "reward": 0.8156664371490479, "reward_std": 0.16709403693675995, "rewards/progression_diversity/mean": -0.0007459928747266531, "rewards/progression_diversity/std": 0.013311400078237057, "rewards/symbolic_reward_accuracy/mean": 0.896484375, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.93310546875, "rewards/symbolic_reward_partial_score/std": 0.22900764644145966, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.013049602508545, "sampling/importance_sampling_ratio/min": 8.142708793457132e-06, "sampling/sampling_logp_difference/max": 11.718387603759766, "sampling/sampling_logp_difference/mean": 0.050904612988233566, "step": 293 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.0861295573413372, "epoch": 0.7736842105263158, "grad_norm": 0.021426010876893997, "learning_rate": 1e-06, "loss": 0.0375, "step": 294 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.08208976686000824, "epoch": 0.7763157894736842, "grad_norm": 0.042882923036813736, "learning_rate": 1e-06, "loss": 0.0493, "step": 295 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.08664394915103912, "epoch": 0.7789473684210526, "grad_norm": 0.023627059534192085, "learning_rate": 1e-06, "loss": 0.024, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11644.0, "completions/mean_length": 1897.166015625, "completions/mean_terminated_length": 1338.849853515625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.08488818258047104, "epoch": 0.781578947368421, "frac_reward_zero_std": 0.40625, "grad_norm": 0.07350046932697296, "learning_rate": 1e-06, "loss": 0.098, "num_tokens": 109518627.0, "reward": 0.8159432411193848, "reward_std": 0.1821536421775818, "rewards/progression_diversity/mean": -0.0023595020174980164, "rewards/progression_diversity/std": 0.03643770143389702, "rewards/symbolic_reward_accuracy/mean": 0.90625, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.9197590947151184, "rewards/symbolic_reward_partial_score/std": 0.26337599754333496, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0125732421875, "sampling/importance_sampling_ratio/min": 1.475510165438454e-11, "sampling/sampling_logp_difference/max": 24.93943214416504, "sampling/sampling_logp_difference/mean": 0.04736366495490074, "step": 297 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.07962662726640701, "epoch": 0.7842105263157895, "grad_norm": 0.038738593459129333, "learning_rate": 1e-06, "loss": 0.1205, "step": 298 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.0848800577223301, "epoch": 0.7868421052631579, "grad_norm": 0.015012623742222786, "learning_rate": 1e-06, "loss": 0.0471, "step": 299 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.08305321261286736, "epoch": 0.7894736842105263, "grad_norm": 0.014111812226474285, "learning_rate": 1e-06, "loss": 0.0205, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14519.0, "completions/mean_length": 1872.60546875, "completions/mean_terminated_length": 1404.4959716796875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.08086536824703217, "epoch": 0.7921052631578948, "frac_reward_zero_std": 0.34375, "grad_norm": 0.05553090572357178, "learning_rate": 1e-06, "loss": 0.0673, "num_tokens": 110901241.0, "reward": 0.8010022640228271, "reward_std": 0.2108319252729416, "rewards/progression_diversity/mean": -0.002319404622539878, "rewards/progression_diversity/std": 0.034735988825559616, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9109700918197632, "rewards/symbolic_reward_partial_score/std": 0.26965585350990295, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0123342275619507, "sampling/importance_sampling_ratio/min": 5.6792261458925644e-14, "sampling/sampling_logp_difference/max": 30.49937629699707, "sampling/sampling_logp_difference/mean": 0.04701223969459534, "step": 301 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.08132592588663101, "epoch": 0.7947368421052632, "grad_norm": 0.05231938883662224, "learning_rate": 1e-06, "loss": 0.0842, "step": 302 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.0790596604347229, "epoch": 0.7973684210526316, "grad_norm": 0.014687119983136654, "learning_rate": 1e-06, "loss": 0.1099, "step": 303 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.08050628378987312, "epoch": 0.8, "grad_norm": 0.027776561677455902, "learning_rate": 1e-06, "loss": 0.071, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14410.0, "completions/mean_length": 1649.3046875, "completions/mean_terminated_length": 1235.0762939453125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.07759291678667068, "epoch": 0.8026315789473685, "frac_reward_zero_std": 0.34375, "grad_norm": 0.05399545282125473, "learning_rate": 1e-06, "loss": 0.1019, "num_tokens": 112154165.0, "reward": 0.8109291791915894, "reward_std": 0.18250510096549988, "rewards/progression_diversity/mean": -0.0008340342901647091, "rewards/progression_diversity/std": 0.014236577786505222, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.9231770634651184, "rewards/symbolic_reward_partial_score/std": 0.25037682056427, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0121451616287231, "sampling/importance_sampling_ratio/min": 2.1538072954964387e-18, "sampling/sampling_logp_difference/max": 40.67929458618164, "sampling/sampling_logp_difference/mean": 0.04661684110760689, "step": 305 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.08159153163433075, "epoch": 0.8052631578947368, "grad_norm": 0.047688040882349014, "learning_rate": 1e-06, "loss": 0.0852, "step": 306 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.08121361956000328, "epoch": 0.8078947368421052, "grad_norm": 0.017503326758742332, "learning_rate": 1e-06, "loss": 0.0402, "step": 307 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.08150652050971985, "epoch": 0.8105263157894737, "grad_norm": 0.0493878610432148, "learning_rate": 1e-06, "loss": 0.0819, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9856.0, "completions/mean_length": 1619.880859375, "completions/mean_terminated_length": 1143.618896484375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.08610192313790321, "epoch": 0.8131578947368421, "frac_reward_zero_std": 0.3125, "grad_norm": 0.05817464739084244, "learning_rate": 1e-06, "loss": 0.0506, "num_tokens": 113374680.0, "reward": 0.8267406821250916, "reward_std": 0.16926893591880798, "rewards/progression_diversity/mean": -0.0017180759459733963, "rewards/progression_diversity/std": 0.03334691748023033, "rewards/symbolic_reward_accuracy/mean": 0.9140625, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.9381510019302368, "rewards/symbolic_reward_partial_score/std": 0.22628821432590485, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0124841928482056, "sampling/importance_sampling_ratio/min": 7.453480520780431e-06, "sampling/sampling_logp_difference/max": 11.806829452514648, "sampling/sampling_logp_difference/mean": 0.047006309032440186, "step": 309 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.08357677236199379, "epoch": 0.8157894736842105, "grad_norm": 0.032003868371248245, "learning_rate": 1e-06, "loss": 0.0531, "step": 310 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.08344599604606628, "epoch": 0.8184210526315789, "grad_norm": 0.032237276434898376, "learning_rate": 1e-06, "loss": 0.0649, "step": 311 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.08107833191752434, "epoch": 0.8210526315789474, "grad_norm": 0.03354015573859215, "learning_rate": 1e-06, "loss": 0.0988, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12625.0, "completions/mean_length": 2280.06640625, "completions/mean_terminated_length": 1308.396728515625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "entropy": 0.07463454082608223, "epoch": 0.8236842105263158, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0700058713555336, "learning_rate": 1e-06, "loss": 0.0735, "num_tokens": 114952058.0, "reward": 0.7640925645828247, "reward_std": 0.19802439212799072, "rewards/progression_diversity/mean": -0.0018792236223816872, "rewards/progression_diversity/std": 0.02314453385770321, "rewards/symbolic_reward_accuracy/mean": 0.841796875, "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, "rewards/symbolic_reward_partial_score/mean": 0.8849283456802368, "rewards/symbolic_reward_partial_score/std": 0.2975020110607147, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.011326789855957, "sampling/importance_sampling_ratio/min": 1.1499383845148259e-06, "sampling/sampling_logp_difference/max": 13.675802230834961, "sampling/sampling_logp_difference/mean": 0.04442109167575836, "step": 313 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.0740601234138012, "epoch": 0.8263157894736842, "grad_norm": 0.03972548991441727, "learning_rate": 1e-06, "loss": 0.0904, "step": 314 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.0788533091545105, "epoch": 0.8289473684210527, "grad_norm": 0.01969255320727825, "learning_rate": 1e-06, "loss": 0.0561, "step": 315 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.0717739462852478, "epoch": 0.8315789473684211, "grad_norm": 0.019936300814151764, "learning_rate": 1e-06, "loss": 0.1043, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7844.0, "completions/mean_length": 2019.439453125, "completions/mean_terminated_length": 1188.4317626953125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.07409506663680077, "epoch": 0.8342105263157895, "frac_reward_zero_std": 0.3125, "grad_norm": 0.05379168689250946, "learning_rate": 1e-06, "loss": 0.1369, "num_tokens": 116408507.0, "reward": 0.794788122177124, "reward_std": 0.20335128903388977, "rewards/progression_diversity/mean": -0.003609658218920231, "rewards/progression_diversity/std": 0.039214227348566055, "rewards/symbolic_reward_accuracy/mean": 0.873046875, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.9215494990348816, "rewards/symbolic_reward_partial_score/std": 0.24459369480609894, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0107078552246094, "sampling/importance_sampling_ratio/min": 6.796539331332951e-09, "sampling/sampling_logp_difference/max": 18.806852340698242, "sampling/sampling_logp_difference/mean": 0.0424201525747776, "step": 317 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.07684982568025589, "epoch": 0.8368421052631579, "grad_norm": 0.02561601623892784, "learning_rate": 1e-06, "loss": 0.0562, "step": 318 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.07624761387705803, "epoch": 0.8394736842105263, "grad_norm": 0.04118689149618149, "learning_rate": 1e-06, "loss": 0.0745, "step": 319 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.07758089527487755, "epoch": 0.8421052631578947, "grad_norm": 0.02611648663878441, "learning_rate": 1e-06, "loss": 0.084, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14539.0, "completions/mean_length": 1871.75, "completions/mean_terminated_length": 1063.8515625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.0785975344479084, "epoch": 0.8447368421052631, "frac_reward_zero_std": 0.5625, "grad_norm": 0.01912563666701317, "learning_rate": 1e-06, "loss": 0.0434, "num_tokens": 117755291.0, "reward": 0.8071247339248657, "reward_std": 0.1330493986606598, "rewards/progression_diversity/mean": -0.005300430115312338, "rewards/progression_diversity/std": 0.05674071982502937, "rewards/symbolic_reward_accuracy/mean": 0.888671875, "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, "rewards/symbolic_reward_partial_score/mean": 0.9308267831802368, "rewards/symbolic_reward_partial_score/std": 0.2333908975124359, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0106561183929443, "sampling/importance_sampling_ratio/min": 9.975841486209447e-09, "sampling/sampling_logp_difference/max": 18.423099517822266, "sampling/sampling_logp_difference/mean": 0.045750319957733154, "step": 321 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.07452228292822838, "epoch": 0.8473684210526315, "grad_norm": 0.028663959354162216, "learning_rate": 1e-06, "loss": 0.0673, "step": 322 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.07840859517455101, "epoch": 0.85, "grad_norm": 0.01813848502933979, "learning_rate": 1e-06, "loss": 0.0938, "step": 323 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.07799200713634491, "epoch": 0.8526315789473684, "grad_norm": 0.015808766707777977, "learning_rate": 1e-06, "loss": 0.0841, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15622.0, "completions/mean_length": 1765.408203125, "completions/mean_terminated_length": 1077.826171875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.07237191870808601, "epoch": 0.8552631578947368, "frac_reward_zero_std": 0.40625, "grad_norm": 0.05738206207752228, "learning_rate": 1e-06, "loss": 0.1679, "num_tokens": 119072172.0, "reward": 0.8257243633270264, "reward_std": 0.1704496443271637, "rewards/progression_diversity/mean": -0.0008076863596215844, "rewards/progression_diversity/std": 0.008417648263275623, "rewards/symbolic_reward_accuracy/mean": 0.91796875, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.9314778447151184, "rewards/symbolic_reward_partial_score/std": 0.24522030353546143, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0103081464767456, "sampling/importance_sampling_ratio/min": 7.582560135332983e-10, "sampling/sampling_logp_difference/max": 21.0, "sampling/sampling_logp_difference/mean": 0.04462500289082527, "step": 325 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.07543528452515602, "epoch": 0.8578947368421053, "grad_norm": 0.022561442106962204, "learning_rate": 1e-06, "loss": 0.0468, "step": 326 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.07968015596270561, "epoch": 0.8605263157894737, "grad_norm": 0.03309568762779236, "learning_rate": 1e-06, "loss": 0.0492, "step": 327 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.08061303943395615, "epoch": 0.8631578947368421, "grad_norm": 0.017540300264954567, "learning_rate": 1e-06, "loss": 0.0731, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8829.0, "completions/mean_length": 1729.509765625, "completions/mean_terminated_length": 1133.7987060546875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.0855712741613388, "epoch": 0.8657894736842106, "frac_reward_zero_std": 0.3125, "grad_norm": 0.04006322845816612, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 120356817.0, "reward": 0.7963314652442932, "reward_std": 0.1821439266204834, "rewards/progression_diversity/mean": -0.0006411472568288445, "rewards/progression_diversity/std": 0.013062160462141037, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.9181314706802368, "rewards/symbolic_reward_partial_score/std": 0.24649207293987274, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.011404275894165, "sampling/importance_sampling_ratio/min": 2.223772499476695e-09, "sampling/sampling_logp_difference/max": 19.924060821533203, "sampling/sampling_logp_difference/mean": 0.045835498720407486, "step": 329 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.07828561961650848, "epoch": 0.868421052631579, "grad_norm": 0.032283343374729156, "learning_rate": 1e-06, "loss": 0.093, "step": 330 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.08197357133030891, "epoch": 0.8710526315789474, "grad_norm": 0.04965886473655701, "learning_rate": 1e-06, "loss": 0.0956, "step": 331 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.08033063635230064, "epoch": 0.8736842105263158, "grad_norm": 0.029129937291145325, "learning_rate": 1e-06, "loss": 0.0689, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9487.0, "completions/mean_length": 1397.1328125, "completions/mean_terminated_length": 1006.6934204101562, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.08633046597242355, "epoch": 0.8763157894736842, "frac_reward_zero_std": 0.375, "grad_norm": 0.034233320504426956, "learning_rate": 1e-06, "loss": 0.0423, "num_tokens": 121475957.0, "reward": 0.8381311893463135, "reward_std": 0.1662929654121399, "rewards/progression_diversity/mean": -0.0003610485000535846, "rewards/progression_diversity/std": 0.006844029296189547, "rewards/symbolic_reward_accuracy/mean": 0.927734375, "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, "rewards/symbolic_reward_partial_score/mean": 0.94677734375, "rewards/symbolic_reward_partial_score/std": 0.2108106017112732, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0122419595718384, "sampling/importance_sampling_ratio/min": 3.6730430110765155e-06, "sampling/sampling_logp_difference/max": 12.514490127563477, "sampling/sampling_logp_difference/mean": 0.047133252024650574, "step": 333 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.08650662377476692, "epoch": 0.8789473684210526, "grad_norm": 0.013222447596490383, "learning_rate": 1e-06, "loss": 0.0508, "step": 334 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.08855986595153809, "epoch": 0.881578947368421, "grad_norm": 0.010891803540289402, "learning_rate": 1e-06, "loss": 0.0263, "step": 335 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.0886671282351017, "epoch": 0.8842105263157894, "grad_norm": 0.014351065270602703, "learning_rate": 1e-06, "loss": 0.0698, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15569.0, "completions/mean_length": 1693.0078125, "completions/mean_terminated_length": 1064.67626953125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.08585496246814728, "epoch": 0.8868421052631579, "frac_reward_zero_std": 0.375, "grad_norm": 0.0539562962949276, "learning_rate": 1e-06, "loss": 0.1175, "num_tokens": 122741049.0, "reward": 0.8183258771896362, "reward_std": 0.15719908475875854, "rewards/progression_diversity/mean": -0.0033535552211105824, "rewards/progression_diversity/std": 0.04218889772891998, "rewards/symbolic_reward_accuracy/mean": 0.90625, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.9290364384651184, "rewards/symbolic_reward_partial_score/std": 0.24505099654197693, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0127696990966797, "sampling/importance_sampling_ratio/min": 7.183120487752603e-06, "sampling/sampling_logp_difference/max": 11.84377670288086, "sampling/sampling_logp_difference/mean": 0.045910563319921494, "step": 337 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.08782363682985306, "epoch": 0.8894736842105263, "grad_norm": 0.026008745655417442, "learning_rate": 1e-06, "loss": 0.0413, "step": 338 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.09032351523637772, "epoch": 0.8921052631578947, "grad_norm": 0.05031033605337143, "learning_rate": 1e-06, "loss": 0.1113, "step": 339 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.0947401411831379, "epoch": 0.8947368421052632, "grad_norm": 0.020250199362635612, "learning_rate": 1e-06, "loss": 0.0466, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7096.0, "completions/mean_length": 1569.30078125, "completions/mean_terminated_length": 967.0772094726562, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.09787581861019135, "epoch": 0.8973684210526316, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03366163745522499, "learning_rate": 1e-06, "loss": 0.0513, "num_tokens": 123942163.0, "reward": 0.8242777585983276, "reward_std": 0.17191796004772186, "rewards/progression_diversity/mean": -0.0038672885857522488, "rewards/progression_diversity/std": 0.05004805698990822, "rewards/symbolic_reward_accuracy/mean": 0.9140625, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.9326171875, "rewards/symbolic_reward_partial_score/std": 0.23826108872890472, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0147292613983154, "sampling/importance_sampling_ratio/min": 4.3840016587637365e-05, "sampling/sampling_logp_difference/max": 10.034963607788086, "sampling/sampling_logp_difference/mean": 0.05117640271782875, "step": 341 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.09365754574537277, "epoch": 0.9, "grad_norm": 0.029882870614528656, "learning_rate": 1e-06, "loss": 0.0784, "step": 342 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.09345110133290291, "epoch": 0.9026315789473685, "grad_norm": 0.03513422608375549, "learning_rate": 1e-06, "loss": 0.0559, "step": 343 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.09143765270709991, "epoch": 0.9052631578947369, "grad_norm": 0.024734172970056534, "learning_rate": 1e-06, "loss": 0.0746, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6936.0, "completions/mean_length": 1666.32421875, "completions/mean_terminated_length": 1005.5305786132812, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.09469415619969368, "epoch": 0.9078947368421053, "frac_reward_zero_std": 0.34375, "grad_norm": 0.03798873722553253, "learning_rate": 1e-06, "loss": 0.0727, "num_tokens": 125223993.0, "reward": 0.810152530670166, "reward_std": 0.19300349056720734, "rewards/progression_diversity/mean": -0.00526084192097187, "rewards/progression_diversity/std": 0.053911034017801285, "rewards/symbolic_reward_accuracy/mean": 0.8984375, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.9181314706802368, "rewards/symbolic_reward_partial_score/std": 0.26354485750198364, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0131090879440308, "sampling/importance_sampling_ratio/min": 1.7146856407634914e-05, "sampling/sampling_logp_difference/max": 10.973695755004883, "sampling/sampling_logp_difference/mean": 0.04458559304475784, "step": 345 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.08893976360559464, "epoch": 0.9105263157894737, "grad_norm": 0.03366508707404137, "learning_rate": 1e-06, "loss": 0.1022, "step": 346 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.09565364941954613, "epoch": 0.9131578947368421, "grad_norm": 0.01510405819863081, "learning_rate": 1e-06, "loss": 0.0209, "step": 347 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.09264839813113213, "epoch": 0.9157894736842105, "grad_norm": 0.015571820549666882, "learning_rate": 1e-06, "loss": 0.1362, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8325.0, "completions/mean_length": 1417.078125, "completions/mean_terminated_length": 934.274169921875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.09942467883229256, "epoch": 0.9184210526315789, "frac_reward_zero_std": 0.3125, "grad_norm": 0.046768125146627426, "learning_rate": 1e-06, "loss": 0.0793, "num_tokens": 126347329.0, "reward": 0.8201934099197388, "reward_std": 0.18495051562786102, "rewards/progression_diversity/mean": -0.0021503996104002, "rewards/progression_diversity/std": 0.03275972977280617, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.9280598759651184, "rewards/symbolic_reward_partial_score/std": 0.24548624455928802, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0158005952835083, "sampling/importance_sampling_ratio/min": 1.7636549500821275e-06, "sampling/sampling_logp_difference/max": 13.248122215270996, "sampling/sampling_logp_difference/mean": 0.05201897770166397, "step": 349 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.09871027618646622, "epoch": 0.9210526315789473, "grad_norm": 0.015367010608315468, "learning_rate": 1e-06, "loss": 0.0738, "step": 350 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.09856873750686646, "epoch": 0.9236842105263158, "grad_norm": 0.02123577892780304, "learning_rate": 1e-06, "loss": 0.0286, "step": 351 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.09770620614290237, "epoch": 0.9263157894736842, "grad_norm": 0.02374398149549961, "learning_rate": 1e-06, "loss": 0.072, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12225.0, "completions/mean_length": 2309.302734375, "completions/mean_terminated_length": 1084.1168212890625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.09652888029813766, "epoch": 0.9289473684210526, "frac_reward_zero_std": 0.375, "grad_norm": 0.02025141753256321, "learning_rate": 1e-06, "loss": 0.0611, "num_tokens": 127938332.0, "reward": 0.7528777122497559, "reward_std": 0.21120837330818176, "rewards/progression_diversity/mean": -0.0052050938829779625, "rewards/progression_diversity/std": 0.045792821794748306, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.8645833134651184, "rewards/symbolic_reward_partial_score/std": 0.32852864265441895, "rewards/tag_count_reward/mean": -0.080078125, "rewards/tag_count_reward/std": 0.271679550409317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0146894454956055, "sampling/importance_sampling_ratio/min": 3.716075696047483e-07, "sampling/sampling_logp_difference/max": 14.805427551269531, "sampling/sampling_logp_difference/mean": 0.050460390746593475, "step": 353 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.09619471430778503, "epoch": 0.9315789473684211, "grad_norm": 0.033115409314632416, "learning_rate": 1e-06, "loss": 0.0835, "step": 354 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.09719686955213547, "epoch": 0.9342105263157895, "grad_norm": 0.03017679788172245, "learning_rate": 1e-06, "loss": 0.111, "step": 355 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.09859620407223701, "epoch": 0.9368421052631579, "grad_norm": 0.02311846613883972, "learning_rate": 1e-06, "loss": 0.0881, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14940.0, "completions/mean_length": 1620.341796875, "completions/mean_terminated_length": 894.2601928710938, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10532217100262642, "epoch": 0.9394736842105263, "frac_reward_zero_std": 0.46875, "grad_norm": 0.028915587812662125, "learning_rate": 1e-06, "loss": 0.039, "num_tokens": 129171915.0, "reward": 0.8127701282501221, "reward_std": 0.16265010833740234, "rewards/progression_diversity/mean": -0.0022891198750585318, "rewards/progression_diversity/std": 0.022422684356570244, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9202474355697632, "rewards/symbolic_reward_partial_score/std": 0.2603859007358551, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0165646076202393, "sampling/importance_sampling_ratio/min": 5.568129235709263e-17, "sampling/sampling_logp_difference/max": 37.42688751220703, "sampling/sampling_logp_difference/mean": 0.053267642855644226, "step": 357 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.10380205139517784, "epoch": 0.9421052631578948, "grad_norm": 0.039858750998973846, "learning_rate": 1e-06, "loss": 0.1027, "step": 358 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.10536672174930573, "epoch": 0.9447368421052632, "grad_norm": 0.011492653749883175, "learning_rate": 1e-06, "loss": 0.0268, "step": 359 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.10100537538528442, "epoch": 0.9473684210526315, "grad_norm": 0.04140660539269447, "learning_rate": 1e-06, "loss": 0.0916, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5861.0, "completions/mean_length": 1051.1953125, "completions/mean_terminated_length": 869.3834228515625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.11518026143312454, "epoch": 0.95, "frac_reward_zero_std": 0.40625, "grad_norm": 0.029431862756609917, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 130096271.0, "reward": 0.8326168060302734, "reward_std": 0.1631331741809845, "rewards/progression_diversity/mean": -4.066104520461522e-05, "rewards/progression_diversity/std": 0.000920054386369884, "rewards/symbolic_reward_accuracy/mean": 0.91796875, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.943359375, "rewards/symbolic_reward_partial_score/std": 0.21264995634555817, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.019619345664978, "sampling/importance_sampling_ratio/min": 5.807437264593318e-05, "sampling/sampling_logp_difference/max": 9.753786087036133, "sampling/sampling_logp_difference/mean": 0.06247701495885849, "step": 361 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.11114299297332764, "epoch": 0.9526315789473684, "grad_norm": 0.045638490468263626, "learning_rate": 1e-06, "loss": 0.058, "step": 362 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.10937144979834557, "epoch": 0.9552631578947368, "grad_norm": 0.013277344405651093, "learning_rate": 1e-06, "loss": 0.0497, "step": 363 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.11187266558408737, "epoch": 0.9578947368421052, "grad_norm": 0.013971379958093166, "learning_rate": 1e-06, "loss": 0.0041, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 6367.0, "completions/mean_length": 1118.857421875, "completions/mean_terminated_length": 845.7236328125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.10873980447649956, "epoch": 0.9605263157894737, "frac_reward_zero_std": 0.5, "grad_norm": 0.03028622269630432, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 131061606.0, "reward": 0.8580078482627869, "reward_std": 0.12159307301044464, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.951171875, "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, "rewards/symbolic_reward_partial_score/mean": 0.9635416269302368, "rewards/symbolic_reward_partial_score/std": 0.17610274255275726, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0192532539367676, "sampling/importance_sampling_ratio/min": 3.5043937196554964e-10, "sampling/sampling_logp_difference/max": 21.771833419799805, "sampling/sampling_logp_difference/mean": 0.062029507011175156, "step": 365 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.11343821510672569, "epoch": 0.9631578947368421, "grad_norm": 0.006393097806721926, "learning_rate": 1e-06, "loss": 0.0393, "step": 366 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.10997330024838448, "epoch": 0.9657894736842105, "grad_norm": 0.04175656661391258, "learning_rate": 1e-06, "loss": 0.0229, "step": 367 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.11030839011073112, "epoch": 0.968421052631579, "grad_norm": 0.009800048545002937, "learning_rate": 1e-06, "loss": 0.0279, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8872.0, "completions/mean_length": 2057.54296875, "completions/mean_terminated_length": 941.5873413085938, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.10226080939173698, "epoch": 0.9710526315789474, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0458550862967968, "learning_rate": 1e-06, "loss": 0.1033, "num_tokens": 132521884.0, "reward": 0.7442119121551514, "reward_std": 0.21518045663833618, "rewards/progression_diversity/mean": -0.0026392091531306505, "rewards/progression_diversity/std": 0.02361941523849964, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.8590494394302368, "rewards/symbolic_reward_partial_score/std": 0.33246058225631714, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0173622369766235, "sampling/importance_sampling_ratio/min": 5.933624197496101e-05, "sampling/sampling_logp_difference/max": 9.732290267944336, "sampling/sampling_logp_difference/mean": 0.05440949648618698, "step": 369 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.10158997401595116, "epoch": 0.9736842105263158, "grad_norm": 0.038810838013887405, "learning_rate": 1e-06, "loss": 0.0925, "step": 370 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.10655439645051956, "epoch": 0.9763157894736842, "grad_norm": 0.019916830584406853, "learning_rate": 1e-06, "loss": 0.0455, "step": 371 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.10742965340614319, "epoch": 0.9789473684210527, "grad_norm": 0.01620314083993435, "learning_rate": 1e-06, "loss": 0.0595, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7506.0, "completions/mean_length": 2036.57421875, "completions/mean_terminated_length": 820.690673828125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.1056484505534172, "epoch": 0.9815789473684211, "frac_reward_zero_std": 0.46875, "grad_norm": 0.028215283527970314, "learning_rate": 1e-06, "loss": 0.0996, "num_tokens": 133962402.0, "reward": 0.7769756317138672, "reward_std": 0.14990827441215515, "rewards/progression_diversity/mean": -0.002632810501381755, "rewards/progression_diversity/std": 0.029632003977894783, "rewards/symbolic_reward_accuracy/mean": 0.857421875, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.9005533456802368, "rewards/symbolic_reward_partial_score/std": 0.28045758605003357, "rewards/tag_count_reward/mean": -0.076171875, "rewards/tag_count_reward/std": 0.26553234457969666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0167696475982666, "sampling/importance_sampling_ratio/min": 2.0920060388868178e-14, "sampling/sampling_logp_difference/max": 31.49806785583496, "sampling/sampling_logp_difference/mean": 0.052999142557382584, "step": 373 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.10324260219931602, "epoch": 0.9842105263157894, "grad_norm": 0.03451311215758324, "learning_rate": 1e-06, "loss": 0.0725, "step": 374 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.10772082582116127, "epoch": 0.9868421052631579, "grad_norm": 0.01771300472319126, "learning_rate": 1e-06, "loss": 0.0278, "step": 375 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.10192153975367546, "epoch": 0.9894736842105263, "grad_norm": 0.05016673728823662, "learning_rate": 1e-06, "loss": 0.1221, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14736.0, "completions/mean_length": 1309.953125, "completions/mean_terminated_length": 792.2586059570312, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.11359592527151108, "epoch": 0.9921052631578947, "frac_reward_zero_std": 0.4375, "grad_norm": 0.02216515503823757, "learning_rate": 1e-06, "loss": 0.0448, "num_tokens": 135040714.0, "reward": 0.8058879971504211, "reward_std": 0.18076883256435394, "rewards/progression_diversity/mean": -0.0020200079306960106, "rewards/progression_diversity/std": 0.026723647490143776, "rewards/symbolic_reward_accuracy/mean": 0.890625, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.9161783456802368, "rewards/symbolic_reward_partial_score/std": 0.2642694115638733, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0174283981323242, "sampling/importance_sampling_ratio/min": 1.079955563909607e-05, "sampling/sampling_logp_difference/max": 11.436005592346191, "sampling/sampling_logp_difference/mean": 0.057007431983947754, "step": 377 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.1086348332464695, "epoch": 0.9947368421052631, "grad_norm": 0.022858861833810806, "learning_rate": 1e-06, "loss": 0.0659, "step": 378 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.11348432675004005, "epoch": 0.9973684210526316, "grad_norm": 0.01786152645945549, "learning_rate": 1e-06, "loss": 0.0745, "step": 379 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.1098298691213131, "epoch": 1.0, "grad_norm": 0.014867125079035759, "learning_rate": 1e-06, "loss": 0.118, "step": 380 }, { "epoch": 1.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.015380859375, "eval_completions/max_length": 13116.78125, "eval_completions/max_terminated_length": 3658.3125, "eval_completions/mean_length": 877.506103515625, "eval_completions/mean_terminated_length": 635.8593416213989, "eval_completions/min_length": 274.03125, "eval_completions/min_terminated_length": 274.03125, "eval_entropy": 0.11584724555723369, "eval_frac_reward_zero_std": 0.50390625, "eval_loss": 0.018385088071227074, "eval_num_tokens": 135040714.0, "eval_reward": 0.8390934336930513, "eval_reward_std": 0.13885579153429717, "eval_rewards/progression_diversity/mean": -0.0005718442766351473, "eval_rewards/progression_diversity/std": 0.005817418514197925, "eval_rewards/symbolic_reward_accuracy/mean": 0.928466796875, "eval_rewards/symbolic_reward_accuracy/std": 0.24404766922816634, "eval_rewards/symbolic_reward_partial_score/mean": 0.9451904278248549, "eval_rewards/symbolic_reward_partial_score/std": 0.19986428710399196, "eval_rewards/tag_count_reward/mean": -0.015380859375, "eval_rewards/tag_count_reward/std": 0.10135847562924027, "eval_runtime": 3216.5627, "eval_samples_per_second": 0.078, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.020699668675661, "eval_sampling/importance_sampling_ratio/min": 0.0016229211131763817, "eval_sampling/sampling_logp_difference/max": 8.347711205482483, "eval_sampling/sampling_logp_difference/mean": 0.06724433228373528, "eval_steps_per_second": 0.001, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5424.0, "completions/mean_length": 1000.6953125, "completions/mean_terminated_length": 756.5159301757812, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.11362230032682419, "epoch": 1.0026315789473683, "frac_reward_zero_std": 0.40625, "grad_norm": 0.04116159304976463, "learning_rate": 1e-06, "loss": 0.0408, "num_tokens": 135975406.0, "reward": 0.8353489637374878, "reward_std": 0.15797469019889832, "rewards/progression_diversity/mean": -0.0002593405661173165, "rewards/progression_diversity/std": 0.005868207197636366, "rewards/symbolic_reward_accuracy/mean": 0.921875, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.9459635019302368, "rewards/symbolic_reward_partial_score/std": 0.20843014121055603, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0185670852661133, "sampling/importance_sampling_ratio/min": 5.377536217565648e-05, "sampling/sampling_logp_difference/max": 9.830695152282715, "sampling/sampling_logp_difference/mean": 0.060036540031433105, "step": 381 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.11580048501491547, "epoch": 1.0052631578947369, "grad_norm": 0.039675887674093246, "learning_rate": 1e-06, "loss": 0.0341, "step": 382 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.11673459783196449, "epoch": 1.0078947368421052, "grad_norm": 0.022269433364272118, "learning_rate": 1e-06, "loss": 0.0286, "step": 383 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.10907693952322006, "epoch": 1.0105263157894737, "grad_norm": 0.01744413562119007, "learning_rate": 1e-06, "loss": 0.0408, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7503.0, "completions/mean_length": 1365.107421875, "completions/mean_terminated_length": 817.8603515625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.11244688928127289, "epoch": 1.013157894736842, "frac_reward_zero_std": 0.4375, "grad_norm": 0.04786130413413048, "learning_rate": 1e-06, "loss": 0.0923, "num_tokens": 137089157.0, "reward": 0.7905688285827637, "reward_std": 0.1710084080696106, "rewards/progression_diversity/mean": -0.0007375068962574005, "rewards/progression_diversity/std": 0.00985936913639307, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.92431640625, "rewards/symbolic_reward_partial_score/std": 0.22582882642745972, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0192362070083618, "sampling/importance_sampling_ratio/min": 2.148686326108873e-05, "sampling/sampling_logp_difference/max": 10.748068809509277, "sampling/sampling_logp_difference/mean": 0.05868230015039444, "step": 385 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.11678306013345718, "epoch": 1.0157894736842106, "grad_norm": 0.017300132662057877, "learning_rate": 1e-06, "loss": 0.0402, "step": 386 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.11940395832061768, "epoch": 1.018421052631579, "grad_norm": 0.01427427213639021, "learning_rate": 1e-06, "loss": 0.0228, "step": 387 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.11587749421596527, "epoch": 1.0210526315789474, "grad_norm": 0.03753054514527321, "learning_rate": 1e-06, "loss": 0.0614, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10616.0, "completions/mean_length": 1142.22265625, "completions/mean_terminated_length": 745.1422729492188, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.1159932017326355, "epoch": 1.0236842105263158, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03958291932940483, "learning_rate": 1e-06, "loss": 0.0994, "num_tokens": 138065431.0, "reward": 0.8293313384056091, "reward_std": 0.15930168330669403, "rewards/progression_diversity/mean": -0.001442349050194025, "rewards/progression_diversity/std": 0.016170360147953033, "rewards/symbolic_reward_accuracy/mean": 0.916015625, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.94091796875, "rewards/symbolic_reward_partial_score/std": 0.2196962684392929, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0209177732467651, "sampling/importance_sampling_ratio/min": 8.514979299434344e-07, "sampling/sampling_logp_difference/max": 13.976268768310547, "sampling/sampling_logp_difference/mean": 0.06216352805495262, "step": 389 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.12261833995580673, "epoch": 1.0263157894736843, "grad_norm": 0.025751272216439247, "learning_rate": 1e-06, "loss": 0.0612, "step": 390 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.12619221955537796, "epoch": 1.0289473684210526, "grad_norm": 0.020393963903188705, "learning_rate": 1e-06, "loss": 0.0126, "step": 391 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.1277061551809311, "epoch": 1.0315789473684212, "grad_norm": 0.02457948587834835, "learning_rate": 1e-06, "loss": 0.0283, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 6283.0, "completions/mean_length": 1035.68359375, "completions/mean_terminated_length": 761.0615844726562, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.12381928041577339, "epoch": 1.0342105263157895, "frac_reward_zero_std": 0.53125, "grad_norm": 0.0314381942152977, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 138999509.0, "reward": 0.8199691772460938, "reward_std": 0.13415905833244324, "rewards/progression_diversity/mean": -0.00015412273933179677, "rewards/progression_diversity/std": 0.0034873993135988712, "rewards/symbolic_reward_accuracy/mean": 0.896484375, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.9461262822151184, "rewards/symbolic_reward_partial_score/std": 0.19489480555057526, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0213356018066406, "sampling/importance_sampling_ratio/min": 5.044097406425863e-07, "sampling/sampling_logp_difference/max": 14.499876976013184, "sampling/sampling_logp_difference/mean": 0.06386812776327133, "step": 393 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.11941304057836533, "epoch": 1.0368421052631578, "grad_norm": 0.007009011693298817, "learning_rate": 1e-06, "loss": 0.0269, "step": 394 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.1252107173204422, "epoch": 1.0394736842105263, "grad_norm": 0.018644485622644424, "learning_rate": 1e-06, "loss": 0.0083, "step": 395 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.12022094428539276, "epoch": 1.0421052631578946, "grad_norm": 0.008900588378310204, "learning_rate": 1e-06, "loss": 0.0657, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5305.0, "completions/mean_length": 1517.716796875, "completions/mean_terminated_length": 690.1093139648438, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.11808853596448898, "epoch": 1.0447368421052632, "frac_reward_zero_std": 0.53125, "grad_norm": 0.033308856189250946, "learning_rate": 1e-06, "loss": 0.0487, "num_tokens": 140177540.0, "reward": 0.805396318435669, "reward_std": 0.14244824647903442, "rewards/progression_diversity/mean": -0.002365510445088148, "rewards/progression_diversity/std": 0.032508544623851776, "rewards/symbolic_reward_accuracy/mean": 0.890625, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.9210612177848816, "rewards/symbolic_reward_partial_score/std": 0.248762309551239, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0201090574264526, "sampling/importance_sampling_ratio/min": 1.2532452728919452e-06, "sampling/sampling_logp_difference/max": 13.589774131774902, "sampling/sampling_logp_difference/mean": 0.05805087089538574, "step": 397 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.11611740663647652, "epoch": 1.0473684210526315, "grad_norm": 0.013962333090603352, "learning_rate": 1e-06, "loss": 0.0597, "step": 398 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.1225820817053318, "epoch": 1.05, "grad_norm": 0.023140504956245422, "learning_rate": 1e-06, "loss": 0.0526, "step": 399 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.12173304334282875, "epoch": 1.0526315789473684, "grad_norm": 0.014247185550630093, "learning_rate": 1e-06, "loss": 0.0829, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9002.0, "completions/mean_length": 1278.06640625, "completions/mean_terminated_length": 664.0040283203125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.12073200196027756, "epoch": 1.055263157894737, "frac_reward_zero_std": 0.53125, "grad_norm": 0.030312160030007362, "learning_rate": 1e-06, "loss": 0.0684, "num_tokens": 141226214.0, "reward": 0.8194763660430908, "reward_std": 0.1385163962841034, "rewards/progression_diversity/mean": -0.0006138992612250149, "rewards/progression_diversity/std": 0.008086828514933586, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.93603515625, "rewards/symbolic_reward_partial_score/std": 0.22104382514953613, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0207290649414062, "sampling/importance_sampling_ratio/min": 0.00014015565102454275, "sampling/sampling_logp_difference/max": 8.872756958007812, "sampling/sampling_logp_difference/mean": 0.06024109199643135, "step": 401 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.1248621977865696, "epoch": 1.0578947368421052, "grad_norm": 0.010959668084979057, "learning_rate": 1e-06, "loss": 0.0659, "step": 402 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.11883337050676346, "epoch": 1.0605263157894738, "grad_norm": 0.024657705798745155, "learning_rate": 1e-06, "loss": 0.0655, "step": 403 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.11919617652893066, "epoch": 1.063157894736842, "grad_norm": 0.01765652373433113, "learning_rate": 1e-06, "loss": 0.0271, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7545.0, "completions/mean_length": 1404.330078125, "completions/mean_terminated_length": 763.6517944335938, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.11879817768931389, "epoch": 1.0657894736842106, "frac_reward_zero_std": 0.5, "grad_norm": 0.012826770544052124, "learning_rate": 1e-06, "loss": 0.0739, "num_tokens": 142359887.0, "reward": 0.8211853504180908, "reward_std": 0.14381209015846252, "rewards/progression_diversity/mean": -0.000610048184171319, "rewards/progression_diversity/std": 0.00695717241615057, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.9345703125, "rewards/symbolic_reward_partial_score/std": 0.22934241592884064, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0197585821151733, "sampling/importance_sampling_ratio/min": 4.271742568562331e-07, "sampling/sampling_logp_difference/max": 14.6660737991333, "sampling/sampling_logp_difference/mean": 0.058111920952796936, "step": 405 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.112503781914711, "epoch": 1.068421052631579, "grad_norm": 0.022235997021198273, "learning_rate": 1e-06, "loss": 0.0677, "step": 406 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.11570753157138824, "epoch": 1.0710526315789473, "grad_norm": 0.01149500161409378, "learning_rate": 1e-06, "loss": 0.0595, "step": 407 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.11613816022872925, "epoch": 1.0736842105263158, "grad_norm": 0.015948958694934845, "learning_rate": 1e-06, "loss": 0.0366, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4484.0, "completions/mean_length": 1437.271484375, "completions/mean_terminated_length": 734.255615234375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.11242419108748436, "epoch": 1.0763157894736841, "frac_reward_zero_std": 0.46875, "grad_norm": 0.037394747138023376, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 143501082.0, "reward": 0.8034499883651733, "reward_std": 0.14993996918201447, "rewards/progression_diversity/mean": -0.0016794700641185045, "rewards/progression_diversity/std": 0.01779768615961075, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9236653447151184, "rewards/symbolic_reward_partial_score/std": 0.24423635005950928, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0196541547775269, "sampling/importance_sampling_ratio/min": 5.551847425522283e-05, "sampling/sampling_logp_difference/max": 9.798794746398926, "sampling/sampling_logp_difference/mean": 0.05961894989013672, "step": 409 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.10686653479933739, "epoch": 1.0789473684210527, "grad_norm": 0.03459122031927109, "learning_rate": 1e-06, "loss": 0.1115, "step": 410 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.10253704711794853, "epoch": 1.081578947368421, "grad_norm": 0.017048373818397522, "learning_rate": 1e-06, "loss": 0.0259, "step": 411 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.0990813598036766, "epoch": 1.0842105263157895, "grad_norm": 0.03026195615530014, "learning_rate": 1e-06, "loss": 0.0124, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4737.0, "completions/mean_length": 1033.18359375, "completions/mean_terminated_length": 664.7640380859375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.10128960758447647, "epoch": 1.0868421052631578, "frac_reward_zero_std": 0.375, "grad_norm": 0.029587438330054283, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 144440376.0, "reward": 0.8033533096313477, "reward_std": 0.17567187547683716, "rewards/progression_diversity/mean": -0.0015859488630667329, "rewards/progression_diversity/std": 0.02551618218421936, "rewards/symbolic_reward_accuracy/mean": 0.876953125, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.9318033456802368, "rewards/symbolic_reward_partial_score/std": 0.2252691388130188, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0161960124969482, "sampling/importance_sampling_ratio/min": 1.3736066648561973e-06, "sampling/sampling_logp_difference/max": 13.49807071685791, "sampling/sampling_logp_difference/mean": 0.05791211128234863, "step": 413 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.09996145218610764, "epoch": 1.0894736842105264, "grad_norm": 0.013834556564688683, "learning_rate": 1e-06, "loss": 0.0171, "step": 414 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.10281230881810188, "epoch": 1.0921052631578947, "grad_norm": 0.02543732523918152, "learning_rate": 1e-06, "loss": 0.0327, "step": 415 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.09636327251791954, "epoch": 1.0947368421052632, "grad_norm": 0.008221838623285294, "learning_rate": 1e-06, "loss": 0.0774, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5506.0, "completions/mean_length": 788.818359375, "completions/mean_terminated_length": 666.0216674804688, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.1014927513897419, "epoch": 1.0973684210526315, "frac_reward_zero_std": 0.5, "grad_norm": 0.01914730854332447, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 145242203.0, "reward": 0.8517071008682251, "reward_std": 0.12221311032772064, "rewards/progression_diversity/mean": -0.00019061629427596927, "rewards/progression_diversity/std": 0.004313154611736536, "rewards/symbolic_reward_accuracy/mean": 0.9375, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.9666340947151184, "rewards/symbolic_reward_partial_score/std": 0.15885646641254425, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.017521858215332, "sampling/importance_sampling_ratio/min": 4.664206699089846e-06, "sampling/sampling_logp_difference/max": 12.275592803955078, "sampling/sampling_logp_difference/mean": 0.060040805488824844, "step": 417 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.10259632021188736, "epoch": 1.1, "grad_norm": 0.03230629488825798, "learning_rate": 1e-06, "loss": 0.0411, "step": 418 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.10374432802200317, "epoch": 1.1026315789473684, "grad_norm": 0.029180046170949936, "learning_rate": 1e-06, "loss": 0.0381, "step": 419 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.10562139376997948, "epoch": 1.1052631578947367, "grad_norm": 0.005970039404928684, "learning_rate": 1e-06, "loss": 0.0082, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7054.0, "completions/mean_length": 1033.38671875, "completions/mean_terminated_length": 727.59765625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.10337032377719879, "epoch": 1.1078947368421053, "frac_reward_zero_std": 0.5, "grad_norm": 0.016906937584280968, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 146154753.0, "reward": 0.8127256631851196, "reward_std": 0.16144107282161713, "rewards/progression_diversity/mean": -0.0018486212939023972, "rewards/progression_diversity/std": 0.030813097953796387, "rewards/symbolic_reward_accuracy/mean": 0.896484375, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.9226887822151184, "rewards/symbolic_reward_partial_score/std": 0.25257766246795654, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0161592960357666, "sampling/importance_sampling_ratio/min": 5.162914021639153e-06, "sampling/sampling_logp_difference/max": 12.174009323120117, "sampling/sampling_logp_difference/mean": 0.05756688863039017, "step": 421 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.09895607084035873, "epoch": 1.1105263157894736, "grad_norm": 0.04962952807545662, "learning_rate": 1e-06, "loss": 0.0424, "step": 422 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.10057874768972397, "epoch": 1.1131578947368421, "grad_norm": 0.015979913994669914, "learning_rate": 1e-06, "loss": 0.0597, "step": 423 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.10110774263739586, "epoch": 1.1157894736842104, "grad_norm": 0.0167376846075058, "learning_rate": 1e-06, "loss": 0.0414, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5384.0, "completions/mean_length": 802.5234375, "completions/mean_terminated_length": 741.419677734375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.09795539081096649, "epoch": 1.118421052631579, "frac_reward_zero_std": 0.375, "grad_norm": 0.01597507670521736, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 146970797.0, "reward": 0.8327491879463196, "reward_std": 0.15943855047225952, "rewards/progression_diversity/mean": -0.0014538828982040286, "rewards/progression_diversity/std": 0.03289761394262314, "rewards/symbolic_reward_accuracy/mean": 0.912109375, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.9529622197151184, "rewards/symbolic_reward_partial_score/std": 0.18036773800849915, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0156102180480957, "sampling/importance_sampling_ratio/min": 2.9369621188379824e-05, "sampling/sampling_logp_difference/max": 10.43554973602295, "sampling/sampling_logp_difference/mean": 0.06037828326225281, "step": 425 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.09897046536207199, "epoch": 1.1210526315789473, "grad_norm": 0.034385696053504944, "learning_rate": 1e-06, "loss": 0.0032, "step": 426 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.09649962186813354, "epoch": 1.1236842105263158, "grad_norm": 0.01485302671790123, "learning_rate": 1e-06, "loss": 0.0068, "step": 427 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.09620106220245361, "epoch": 1.1263157894736842, "grad_norm": 0.019307559356093407, "learning_rate": 1e-06, "loss": 0.0451, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7232.0, "completions/mean_length": 914.90234375, "completions/mean_terminated_length": 731.474365234375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09814492240548134, "epoch": 1.1289473684210527, "frac_reward_zero_std": 0.5625, "grad_norm": 0.021051058545708656, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 147822843.0, "reward": 0.8289062976837158, "reward_std": 0.11690939962863922, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9466145634651184, "rewards/symbolic_reward_partial_score/std": 0.19831162691116333, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0156972408294678, "sampling/importance_sampling_ratio/min": 4.255066414771136e-06, "sampling/sampling_logp_difference/max": 12.367400169372559, "sampling/sampling_logp_difference/mean": 0.06155911460518837, "step": 429 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.10168719664216042, "epoch": 1.131578947368421, "grad_norm": 0.02653714269399643, "learning_rate": 1e-06, "loss": 0.0208, "step": 430 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.09842254221439362, "epoch": 1.1342105263157896, "grad_norm": 0.010419169440865517, "learning_rate": 1e-06, "loss": 0.0184, "step": 431 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.09445172548294067, "epoch": 1.1368421052631579, "grad_norm": 0.016496408730745316, "learning_rate": 1e-06, "loss": 0.0163, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4974.0, "completions/mean_length": 910.11328125, "completions/mean_terminated_length": 695.623779296875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.09750872850418091, "epoch": 1.1394736842105262, "frac_reward_zero_std": 0.625, "grad_norm": 0.01833958737552166, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 148674101.0, "reward": 0.8388184309005737, "reward_std": 0.0944415032863617, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.919921875, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.9607747197151184, "rewards/symbolic_reward_partial_score/std": 0.16794845461845398, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0153504610061646, "sampling/importance_sampling_ratio/min": 1.6851483541913126e-09, "sampling/sampling_logp_difference/max": 20.201412200927734, "sampling/sampling_logp_difference/mean": 0.05861156806349754, "step": 433 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.09329849109053612, "epoch": 1.1421052631578947, "grad_norm": 0.045422203838825226, "learning_rate": 1e-06, "loss": 0.0453, "step": 434 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.09762432053685188, "epoch": 1.1447368421052633, "grad_norm": 0.014451306313276291, "learning_rate": 1e-06, "loss": 0.0188, "step": 435 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.09433042258024216, "epoch": 1.1473684210526316, "grad_norm": 0.01592213101685047, "learning_rate": 1e-06, "loss": 0.0077, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5906.0, "completions/mean_length": 745.5859375, "completions/mean_terminated_length": 684.2588500976562, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.09743008762598038, "epoch": 1.15, "frac_reward_zero_std": 0.65625, "grad_norm": 0.027467206120491028, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 149445473.0, "reward": 0.8571289777755737, "reward_std": 0.09262826293706894, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9453125, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.9677734375, "rewards/symbolic_reward_partial_score/std": 0.1515229344367981, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0158276557922363, "sampling/importance_sampling_ratio/min": 4.494921683228018e-16, "sampling/sampling_logp_difference/max": 35.33841323852539, "sampling/sampling_logp_difference/mean": 0.06052025035023689, "step": 437 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.09928737580776215, "epoch": 1.1526315789473685, "grad_norm": 0.026970867067575455, "learning_rate": 1e-06, "loss": 0.0147, "step": 438 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.09795548766851425, "epoch": 1.1552631578947368, "grad_norm": 0.014579696580767632, "learning_rate": 1e-06, "loss": 0.0073, "step": 439 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.10043598338961601, "epoch": 1.1578947368421053, "grad_norm": 0.00749570457264781, "learning_rate": 1e-06, "loss": 0.019, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5394.0, "completions/mean_length": 1239.98828125, "completions/mean_terminated_length": 845.4548950195312, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.0946054458618164, "epoch": 1.1605263157894736, "frac_reward_zero_std": 0.375, "grad_norm": 0.030099380761384964, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 150507195.0, "reward": 0.7865234613418579, "reward_std": 0.19104072451591492, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.9186197519302368, "rewards/symbolic_reward_partial_score/std": 0.2408839911222458, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0148797035217285, "sampling/importance_sampling_ratio/min": 3.064930638174701e-07, "sampling/sampling_logp_difference/max": 14.99807071685791, "sampling/sampling_logp_difference/mean": 0.05635258927941322, "step": 441 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.0940646082162857, "epoch": 1.1631578947368422, "grad_norm": 0.03301357850432396, "learning_rate": 1e-06, "loss": 0.0389, "step": 442 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.09547659382224083, "epoch": 1.1657894736842105, "grad_norm": 0.023627353832125664, "learning_rate": 1e-06, "loss": 0.0095, "step": 443 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.0967930480837822, "epoch": 1.168421052631579, "grad_norm": 0.028321361169219017, "learning_rate": 1e-06, "loss": 0.0502, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 9650.0, "completions/mean_length": 1090.6171875, "completions/mean_terminated_length": 878.6297607421875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.10013857111334801, "epoch": 1.1710526315789473, "frac_reward_zero_std": 0.375, "grad_norm": 0.04974789917469025, "learning_rate": 1e-06, "loss": 0.0319, "num_tokens": 151460055.0, "reward": 0.800537109375, "reward_std": 0.16888116300106049, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.873046875, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.92626953125, "rewards/symbolic_reward_partial_score/std": 0.22773192822933197, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.016984224319458, "sampling/importance_sampling_ratio/min": 2.9851989324924944e-07, "sampling/sampling_logp_difference/max": 15.024429321289062, "sampling/sampling_logp_difference/mean": 0.059770140796899796, "step": 445 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.09959771484136581, "epoch": 1.1736842105263159, "grad_norm": 0.017701053991913795, "learning_rate": 1e-06, "loss": 0.0206, "step": 446 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.10232729837298393, "epoch": 1.1763157894736842, "grad_norm": 0.02004496566951275, "learning_rate": 1e-06, "loss": 0.0277, "step": 447 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.10072638839483261, "epoch": 1.1789473684210527, "grad_norm": 0.018607337027788162, "learning_rate": 1e-06, "loss": 0.031, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6300.0, "completions/mean_length": 931.9296875, "completions/mean_terminated_length": 840.8566284179688, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.10329998284578323, "epoch": 1.181578947368421, "frac_reward_zero_std": 0.46875, "grad_norm": 0.039218734949827194, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 152344499.0, "reward": 0.8318839073181152, "reward_std": 0.14675593376159668, "rewards/progression_diversity/mean": -8.955165685620159e-05, "rewards/progression_diversity/std": 0.002026322763413191, "rewards/symbolic_reward_accuracy/mean": 0.916015625, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.94287109375, "rewards/symbolic_reward_partial_score/std": 0.2136351764202118, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0176771879196167, "sampling/importance_sampling_ratio/min": 6.2567501117882784e-06, "sampling/sampling_logp_difference/max": 11.981849670410156, "sampling/sampling_logp_difference/mean": 0.06110313534736633, "step": 449 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.10243717581033707, "epoch": 1.1842105263157894, "grad_norm": 0.02117745950818062, "learning_rate": 1e-06, "loss": 0.021, "step": 450 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.10411274060606956, "epoch": 1.186842105263158, "grad_norm": 0.02108968421816826, "learning_rate": 1e-06, "loss": 0.0114, "step": 451 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.10935872420668602, "epoch": 1.1894736842105262, "grad_norm": 0.007527265697717667, "learning_rate": 1e-06, "loss": 0.0099, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7876.0, "completions/max_terminated_length": 7876.0, "completions/mean_length": 748.26171875, "completions/mean_terminated_length": 748.26171875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.10911908373236656, "epoch": 1.1921052631578948, "frac_reward_zero_std": 0.5, "grad_norm": 0.029693739488720894, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 153135929.0, "reward": 0.8560059070587158, "reward_std": 0.12228970229625702, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.943359375, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.9666340947151184, "rewards/symbolic_reward_partial_score/std": 0.15885646641254425, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0188539028167725, "sampling/importance_sampling_ratio/min": 1.5778656234033406e-05, "sampling/sampling_logp_difference/max": 11.056852340698242, "sampling/sampling_logp_difference/mean": 0.06543545424938202, "step": 453 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.11097723618149757, "epoch": 1.194736842105263, "grad_norm": 0.019065558910369873, "learning_rate": 1e-06, "loss": -0.0018, "step": 454 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.11221178621053696, "epoch": 1.1973684210526316, "grad_norm": 0.019534561783075333, "learning_rate": 1e-06, "loss": -0.0013, "step": 455 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.10779605805873871, "epoch": 1.2, "grad_norm": 0.009235396981239319, "learning_rate": 1e-06, "loss": 0.0147, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6720.0, "completions/mean_length": 898.009765625, "completions/mean_terminated_length": 745.2879638671875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11065573990345001, "epoch": 1.2026315789473685, "frac_reward_zero_std": 0.40625, "grad_norm": 0.012675322592258453, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 154012350.0, "reward": 0.8206509947776794, "reward_std": 0.14962440729141235, "rewards/progression_diversity/mean": -0.00033113209065049887, "rewards/progression_diversity/std": 0.0074926638044416904, "rewards/symbolic_reward_accuracy/mean": 0.896484375, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.94580078125, "rewards/symbolic_reward_partial_score/std": 0.1969550997018814, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0194942951202393, "sampling/importance_sampling_ratio/min": 1.5589863266995962e-07, "sampling/sampling_logp_difference/max": 15.674059867858887, "sampling/sampling_logp_difference/mean": 0.061966508626937866, "step": 457 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.10747477412223816, "epoch": 1.2052631578947368, "grad_norm": 0.023037495091557503, "learning_rate": 1e-06, "loss": 0.0034, "step": 458 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.10901957005262375, "epoch": 1.2078947368421054, "grad_norm": 0.032999638468027115, "learning_rate": 1e-06, "loss": 0.039, "step": 459 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.1066613681614399, "epoch": 1.2105263157894737, "grad_norm": 0.0317135825753212, "learning_rate": 1e-06, "loss": 0.0266, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11462.0, "completions/mean_length": 1336.01953125, "completions/mean_terminated_length": 912.98388671875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.11170916259288788, "epoch": 1.2131578947368422, "frac_reward_zero_std": 0.5625, "grad_norm": 0.027878640219569206, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 155114376.0, "reward": 0.7967281341552734, "reward_std": 0.12850847840309143, "rewards/progression_diversity/mean": -4.386279761092737e-05, "rewards/progression_diversity/std": 0.0009925017366185784, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.9148762822151184, "rewards/symbolic_reward_partial_score/std": 0.25462573766708374, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0198333263397217, "sampling/importance_sampling_ratio/min": 3.876455139106838e-06, "sampling/sampling_logp_difference/max": 12.460589408874512, "sampling/sampling_logp_difference/mean": 0.06402070820331573, "step": 461 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.11339271068572998, "epoch": 1.2157894736842105, "grad_norm": 0.02045866660773754, "learning_rate": 1e-06, "loss": 0.0186, "step": 462 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.11221589148044586, "epoch": 1.2184210526315788, "grad_norm": 0.011421293020248413, "learning_rate": 1e-06, "loss": 0.0203, "step": 463 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.11195248365402222, "epoch": 1.2210526315789474, "grad_norm": 0.016839897260069847, "learning_rate": 1e-06, "loss": 0.0322, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7398.0, "completions/mean_length": 822.09375, "completions/mean_terminated_length": 761.0667114257812, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.11619088798761368, "epoch": 1.2236842105263157, "frac_reward_zero_std": 0.40625, "grad_norm": 0.045058924704790115, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 155939256.0, "reward": 0.8314453363418579, "reward_std": 0.14568641781806946, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9524739384651184, "rewards/symbolic_reward_partial_score/std": 0.1864290088415146, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0201449394226074, "sampling/importance_sampling_ratio/min": 7.38695504765019e-08, "sampling/sampling_logp_difference/max": 16.42096519470215, "sampling/sampling_logp_difference/mean": 0.06559212505817413, "step": 465 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.11323034018278122, "epoch": 1.2263157894736842, "grad_norm": 0.018060827627778053, "learning_rate": 1e-06, "loss": 0.0557, "step": 466 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.11986911296844482, "epoch": 1.2289473684210526, "grad_norm": 0.006362342741340399, "learning_rate": 1e-06, "loss": 0.0211, "step": 467 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.12106319144368172, "epoch": 1.231578947368421, "grad_norm": 0.0218115895986557, "learning_rate": 1e-06, "loss": 0.0058, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 12412.0, "completions/mean_length": 1080.7890625, "completions/mean_terminated_length": 899.328125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.12077518552541733, "epoch": 1.2342105263157894, "frac_reward_zero_std": 0.375, "grad_norm": 0.023888949304819107, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 156925804.0, "reward": 0.8068230152130127, "reward_std": 0.18419486284255981, "rewards/progression_diversity/mean": -0.0012958223232999444, "rewards/progression_diversity/std": 0.029321111738681793, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9231771230697632, "rewards/symbolic_reward_partial_score/std": 0.24108576774597168, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0196768045425415, "sampling/importance_sampling_ratio/min": 8.88462352577335e-08, "sampling/sampling_logp_difference/max": 16.236358642578125, "sampling/sampling_logp_difference/mean": 0.0635080635547638, "step": 469 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.11772556602954865, "epoch": 1.236842105263158, "grad_norm": 0.019387010484933853, "learning_rate": 1e-06, "loss": 0.0357, "step": 470 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.11631882935762405, "epoch": 1.2394736842105263, "grad_norm": 0.02116510644555092, "learning_rate": 1e-06, "loss": 0.0205, "step": 471 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.11524809151887894, "epoch": 1.2421052631578948, "grad_norm": 0.017463896423578262, "learning_rate": 1e-06, "loss": 0.0445, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7361.0, "completions/mean_length": 927.275390625, "completions/mean_terminated_length": 805.5689086914062, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.12217172235250473, "epoch": 1.2447368421052631, "frac_reward_zero_std": 0.4375, "grad_norm": 0.030732987448573112, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 157807385.0, "reward": 0.8150861263275146, "reward_std": 0.15566140413284302, "rewards/progression_diversity/mean": -0.0001790223177522421, "rewards/progression_diversity/std": 0.004050812683999538, "rewards/symbolic_reward_accuracy/mean": 0.888671875, "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, "rewards/symbolic_reward_partial_score/mean": 0.9422200322151184, "rewards/symbolic_reward_partial_score/std": 0.19841378927230835, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0209197998046875, "sampling/importance_sampling_ratio/min": 4.93685820401879e-06, "sampling/sampling_logp_difference/max": 12.218781471252441, "sampling/sampling_logp_difference/mean": 0.06566841155290604, "step": 473 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.12189845740795135, "epoch": 1.2473684210526317, "grad_norm": 0.023981690406799316, "learning_rate": 1e-06, "loss": 0.03, "step": 474 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.1238529235124588, "epoch": 1.25, "grad_norm": 0.02387721836566925, "learning_rate": 1e-06, "loss": 0.0418, "step": 475 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.12588384374976158, "epoch": 1.2526315789473683, "grad_norm": 0.021237516775727272, "learning_rate": 1e-06, "loss": 0.007, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 9261.0, "completions/mean_length": 899.77734375, "completions/mean_terminated_length": 716.1699829101562, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.13065219670534134, "epoch": 1.2552631578947369, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0358455665409565, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 158659847.0, "reward": 0.8351035714149475, "reward_std": 0.10815407335758209, "rewards/progression_diversity/mean": -0.0003900247684214264, "rewards/progression_diversity/std": 0.008825253695249557, "rewards/symbolic_reward_accuracy/mean": 0.91796875, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.9510090947151184, "rewards/symbolic_reward_partial_score/std": 0.1945120096206665, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0229235887527466, "sampling/importance_sampling_ratio/min": 4.0339618863072246e-05, "sampling/sampling_logp_difference/max": 10.118176460266113, "sampling/sampling_logp_difference/mean": 0.06817199289798737, "step": 477 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.13009684532880783, "epoch": 1.2578947368421054, "grad_norm": 0.017281439155340195, "learning_rate": 1e-06, "loss": 0.0275, "step": 478 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.12655635178089142, "epoch": 1.2605263157894737, "grad_norm": 0.011967520229518414, "learning_rate": 1e-06, "loss": 0.0433, "step": 479 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.13125859946012497, "epoch": 1.263157894736842, "grad_norm": 0.010364379733800888, "learning_rate": 1e-06, "loss": 0.0029, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11268.0, "completions/mean_length": 1359.4765625, "completions/mean_terminated_length": 780.4381103515625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.123091921210289, "epoch": 1.2657894736842106, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03727136552333832, "learning_rate": 1e-06, "loss": 0.0505, "num_tokens": 159759867.0, "reward": 0.8022311925888062, "reward_std": 0.1121816635131836, "rewards/progression_diversity/mean": -0.001497793011367321, "rewards/progression_diversity/std": 0.01563715748488903, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9208984375, "rewards/symbolic_reward_partial_score/std": 0.2492290735244751, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0214684009552002, "sampling/importance_sampling_ratio/min": 1.330551033934535e-11, "sampling/sampling_logp_difference/max": 25.042842864990234, "sampling/sampling_logp_difference/mean": 0.06472167372703552, "step": 481 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.12072613090276718, "epoch": 1.268421052631579, "grad_norm": 0.022708555683493614, "learning_rate": 1e-06, "loss": 0.0388, "step": 482 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.12625328078866005, "epoch": 1.2710526315789474, "grad_norm": 0.021841494366526604, "learning_rate": 1e-06, "loss": 0.0393, "step": 483 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.12154950946569443, "epoch": 1.2736842105263158, "grad_norm": 0.009138455614447594, "learning_rate": 1e-06, "loss": 0.0245, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4483.0, "completions/mean_length": 1131.763671875, "completions/mean_terminated_length": 671.4345703125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.12409983202815056, "epoch": 1.2763157894736843, "frac_reward_zero_std": 0.71875, "grad_norm": 0.025687798857688904, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 160721602.0, "reward": 0.8422271013259888, "reward_std": 0.07236142456531525, "rewards/progression_diversity/mean": -0.0009313340415246785, "rewards/progression_diversity/std": 0.01315159909427166, "rewards/symbolic_reward_accuracy/mean": 0.931640625, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.9539387822151184, "rewards/symbolic_reward_partial_score/std": 0.19256454706192017, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0208799839019775, "sampling/importance_sampling_ratio/min": 0.000312736548949033, "sampling/sampling_logp_difference/max": 8.070149421691895, "sampling/sampling_logp_difference/mean": 0.06437121331691742, "step": 485 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.11737996339797974, "epoch": 1.2789473684210526, "grad_norm": 0.024496793746948242, "learning_rate": 1e-06, "loss": 0.0558, "step": 486 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.11956480145454407, "epoch": 1.2815789473684212, "grad_norm": 0.010835636407136917, "learning_rate": 1e-06, "loss": 0.0307, "step": 487 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.1189895048737526, "epoch": 1.2842105263157895, "grad_norm": 0.011718123219907284, "learning_rate": 1e-06, "loss": 0.0353, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5296.0, "completions/mean_length": 1337.478515625, "completions/mean_terminated_length": 789.2247314453125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.11463934555649757, "epoch": 1.2868421052631578, "frac_reward_zero_std": 0.34375, "grad_norm": 0.04130804166197777, "learning_rate": 1e-06, "loss": 0.0829, "num_tokens": 161806871.0, "reward": 0.7914862632751465, "reward_std": 0.1913241147994995, "rewards/progression_diversity/mean": -0.0017659981967881322, "rewards/progression_diversity/std": 0.017526477575302124, "rewards/symbolic_reward_accuracy/mean": 0.8671875, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.9156900644302368, "rewards/symbolic_reward_partial_score/std": 0.2554028630256653, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.018146276473999, "sampling/importance_sampling_ratio/min": 1.5589847635055776e-07, "sampling/sampling_logp_difference/max": 15.674060821533203, "sampling/sampling_logp_difference/mean": 0.06120399013161659, "step": 489 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.11538948118686676, "epoch": 1.2894736842105263, "grad_norm": 0.013879453763365746, "learning_rate": 1e-06, "loss": 0.0104, "step": 490 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.11806956678628922, "epoch": 1.2921052631578949, "grad_norm": 0.012860557064414024, "learning_rate": 1e-06, "loss": 0.0499, "step": 491 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.1125846691429615, "epoch": 1.2947368421052632, "grad_norm": 0.02000536397099495, "learning_rate": 1e-06, "loss": 0.0199, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 8706.0, "completions/mean_length": 924.626953125, "completions/mean_terminated_length": 741.3142700195312, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.1223670057952404, "epoch": 1.2973684210526315, "frac_reward_zero_std": 0.5, "grad_norm": 0.014644470997154713, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 162687256.0, "reward": 0.8528740406036377, "reward_std": 0.12011945247650146, "rewards/progression_diversity/mean": -0.000687220657709986, "rewards/progression_diversity/std": 0.013531150296330452, "rewards/symbolic_reward_accuracy/mean": 0.94140625, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.96337890625, "rewards/symbolic_reward_partial_score/std": 0.17117686569690704, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0194642543792725, "sampling/importance_sampling_ratio/min": 2.044983940790368e-21, "sampling/sampling_logp_difference/max": 47.63889694213867, "sampling/sampling_logp_difference/mean": 0.06407853960990906, "step": 493 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.1194252297282219, "epoch": 1.3, "grad_norm": 0.00824655033648014, "learning_rate": 1e-06, "loss": 0.033, "step": 494 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.11969351023435593, "epoch": 1.3026315789473684, "grad_norm": 0.05345854163169861, "learning_rate": 1e-06, "loss": 0.0245, "step": 495 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.11757103353738785, "epoch": 1.305263157894737, "grad_norm": 0.00989691074937582, "learning_rate": 1e-06, "loss": -0.0039, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 9763.0, "completions/mean_length": 871.619140625, "completions/mean_terminated_length": 780.1906127929688, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.1206367239356041, "epoch": 1.3078947368421052, "frac_reward_zero_std": 0.46875, "grad_norm": 0.048302192240953445, "learning_rate": 1e-06, "loss": 0.0334, "num_tokens": 163535829.0, "reward": 0.830558180809021, "reward_std": 0.14636383950710297, "rewards/progression_diversity/mean": -0.000826410308945924, "rewards/progression_diversity/std": 0.01099415123462677, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9501953125, "rewards/symbolic_reward_partial_score/std": 0.1905975341796875, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.020845651626587, "sampling/importance_sampling_ratio/min": 1.0631980984499023e-07, "sampling/sampling_logp_difference/max": 16.056814193725586, "sampling/sampling_logp_difference/mean": 0.06594814360141754, "step": 497 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.1199469231069088, "epoch": 1.3105263157894738, "grad_norm": 0.013368850573897362, "learning_rate": 1e-06, "loss": 0.0202, "step": 498 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.12389621138572693, "epoch": 1.313157894736842, "grad_norm": 0.015475841239094734, "learning_rate": 1e-06, "loss": 0.0216, "step": 499 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.12212049216032028, "epoch": 1.3157894736842106, "grad_norm": 0.018125230446457863, "learning_rate": 1e-06, "loss": 0.0084, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6161.0, "completions/mean_length": 1061.85546875, "completions/mean_terminated_length": 725.4411010742188, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.12253501266241074, "epoch": 1.318421052631579, "frac_reward_zero_std": 0.53125, "grad_norm": 0.031911756843328476, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 164483307.0, "reward": 0.8256685137748718, "reward_std": 0.12521421909332275, "rewards/progression_diversity/mean": -0.0015109577216207981, "rewards/progression_diversity/std": 0.028086457401514053, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.9423828125, "rewards/symbolic_reward_partial_score/std": 0.20720478892326355, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.019927740097046, "sampling/importance_sampling_ratio/min": 7.929816092655528e-06, "sampling/sampling_logp_difference/max": 11.744880676269531, "sampling/sampling_logp_difference/mean": 0.06401927769184113, "step": 501 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.11759228631854057, "epoch": 1.3210526315789473, "grad_norm": 0.14602532982826233, "learning_rate": 1e-06, "loss": 0.0659, "step": 502 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.12064041569828987, "epoch": 1.3236842105263158, "grad_norm": 0.01236443966627121, "learning_rate": 1e-06, "loss": -0.0013, "step": 503 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.12389854341745377, "epoch": 1.3263157894736843, "grad_norm": 0.015112272463738918, "learning_rate": 1e-06, "loss": -0.0001, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6364.0, "completions/mean_length": 1221.919921875, "completions/mean_terminated_length": 826.9158325195312, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.12016144394874573, "epoch": 1.3289473684210527, "frac_reward_zero_std": 0.46875, "grad_norm": 0.019961973652243614, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 165545602.0, "reward": 0.8171666860580444, "reward_std": 0.16274571418762207, "rewards/progression_diversity/mean": -0.0020828458946198225, "rewards/progression_diversity/std": 0.021263638511300087, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.9231770634651184, "rewards/symbolic_reward_partial_score/std": 0.2538268566131592, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0189927816390991, "sampling/importance_sampling_ratio/min": 2.001787464678273e-07, "sampling/sampling_logp_difference/max": 15.424055099487305, "sampling/sampling_logp_difference/mean": 0.06025902181863785, "step": 505 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.1265234798192978, "epoch": 1.331578947368421, "grad_norm": 0.014271781779825687, "learning_rate": 1e-06, "loss": 0.0147, "step": 506 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.11439178138971329, "epoch": 1.3342105263157895, "grad_norm": 0.011770401149988174, "learning_rate": 1e-06, "loss": 0.0508, "step": 507 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.11405204609036446, "epoch": 1.3368421052631578, "grad_norm": 0.013367000967264175, "learning_rate": 1e-06, "loss": 0.0757, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 6573.0, "completions/mean_length": 1136.01171875, "completions/mean_terminated_length": 832.2669677734375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.11793714761734009, "epoch": 1.3394736842105264, "frac_reward_zero_std": 0.5, "grad_norm": 0.030582962557673454, "learning_rate": 1e-06, "loss": 0.0369, "num_tokens": 166528040.0, "reward": 0.8212710022926331, "reward_std": 0.13752031326293945, "rewards/progression_diversity/mean": -0.0018108001677319407, "rewards/progression_diversity/std": 0.021729890257120132, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.935546875, "rewards/symbolic_reward_partial_score/std": 0.22706012427806854, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0203887224197388, "sampling/importance_sampling_ratio/min": 2.9788969186483882e-05, "sampling/sampling_logp_difference/max": 10.421372413635254, "sampling/sampling_logp_difference/mean": 0.06702390313148499, "step": 509 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.11901584640145302, "epoch": 1.3421052631578947, "grad_norm": 0.0400007963180542, "learning_rate": 1e-06, "loss": 0.0253, "step": 510 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.11924078688025475, "epoch": 1.3447368421052632, "grad_norm": 0.02401769533753395, "learning_rate": 1e-06, "loss": 0.0038, "step": 511 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.12153895944356918, "epoch": 1.3473684210526315, "grad_norm": 0.0278632752597332, "learning_rate": 1e-06, "loss": 0.0202, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6421.0, "completions/mean_length": 1049.525390625, "completions/mean_terminated_length": 712.84033203125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.1217559427022934, "epoch": 1.35, "frac_reward_zero_std": 0.5, "grad_norm": 0.01790975034236908, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 167447669.0, "reward": 0.8032069802284241, "reward_std": 0.15127655863761902, "rewards/progression_diversity/mean": -0.0015715567860752344, "rewards/progression_diversity/std": 0.016383837908506393, "rewards/symbolic_reward_accuracy/mean": 0.87890625, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.9267578125, "rewards/symbolic_reward_partial_score/std": 0.23392236232757568, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0203694105148315, "sampling/importance_sampling_ratio/min": 7.359984124377661e-07, "sampling/sampling_logp_difference/max": 14.122037887573242, "sampling/sampling_logp_difference/mean": 0.06768319010734558, "step": 513 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.1187475174665451, "epoch": 1.3526315789473684, "grad_norm": 0.009799002669751644, "learning_rate": 1e-06, "loss": 0.0417, "step": 514 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.11943993717432022, "epoch": 1.3552631578947367, "grad_norm": 0.020848069339990616, "learning_rate": 1e-06, "loss": 0.0084, "step": 515 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.12406942620873451, "epoch": 1.3578947368421053, "grad_norm": 0.019777188077569008, "learning_rate": 1e-06, "loss": -0.0014, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11133.0, "completions/mean_length": 1168.974609375, "completions/mean_terminated_length": 834.9121704101562, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.1224772073328495, "epoch": 1.3605263157894738, "frac_reward_zero_std": 0.46875, "grad_norm": 0.021967625245451927, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 168440488.0, "reward": 0.8120394945144653, "reward_std": 0.16349293291568756, "rewards/progression_diversity/mean": -0.0021093892864882946, "rewards/progression_diversity/std": 0.01896587759256363, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.9249674081802368, "rewards/symbolic_reward_partial_score/std": 0.24613560736179352, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0184623003005981, "sampling/importance_sampling_ratio/min": 1.3710611028727726e-06, "sampling/sampling_logp_difference/max": 13.49992561340332, "sampling/sampling_logp_difference/mean": 0.06050651893019676, "step": 517 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.11167807132005692, "epoch": 1.3631578947368421, "grad_norm": 0.03911877050995827, "learning_rate": 1e-06, "loss": 0.083, "step": 518 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.11478234827518463, "epoch": 1.3657894736842104, "grad_norm": 0.020847424864768982, "learning_rate": 1e-06, "loss": 0.0529, "step": 519 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.11543101072311401, "epoch": 1.368421052631579, "grad_norm": 0.018829762935638428, "learning_rate": 1e-06, "loss": 0.0307, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6398.0, "completions/mean_length": 1195.33984375, "completions/mean_terminated_length": 736.9295654296875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.12062574923038483, "epoch": 1.3710526315789473, "frac_reward_zero_std": 0.53125, "grad_norm": 0.014021288603544235, "learning_rate": 1e-06, "loss": 0.0427, "num_tokens": 169450134.0, "reward": 0.7832722663879395, "reward_std": 0.13144497573375702, "rewards/progression_diversity/mean": -0.002853758167475462, "rewards/progression_diversity/std": 0.023874642327427864, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.9137369394302368, "rewards/symbolic_reward_partial_score/std": 0.2524980306625366, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0197254419326782, "sampling/importance_sampling_ratio/min": 1.406861771338595e-16, "sampling/sampling_logp_difference/max": 36.5, "sampling/sampling_logp_difference/mean": 0.064872145652771, "step": 521 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.12126883119344711, "epoch": 1.3736842105263158, "grad_norm": 0.018665987998247147, "learning_rate": 1e-06, "loss": 0.0304, "step": 522 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.1257994957268238, "epoch": 1.3763157894736842, "grad_norm": 0.028348246589303017, "learning_rate": 1e-06, "loss": 0.0122, "step": 523 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.12155702710151672, "epoch": 1.3789473684210527, "grad_norm": 0.027928415685892105, "learning_rate": 1e-06, "loss": 0.0405, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15986.0, "completions/mean_length": 1216.671875, "completions/mean_terminated_length": 758.9053955078125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.12585609033703804, "epoch": 1.381578947368421, "frac_reward_zero_std": 0.53125, "grad_norm": 0.020609745755791664, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 170497390.0, "reward": 0.8209607601165771, "reward_std": 0.13462477922439575, "rewards/progression_diversity/mean": -0.003541269339621067, "rewards/progression_diversity/std": 0.028825946152210236, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.9378255009651184, "rewards/symbolic_reward_partial_score/std": 0.22360049188137054, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0208656787872314, "sampling/importance_sampling_ratio/min": 2.124352249666117e-07, "sampling/sampling_logp_difference/max": 15.364628791809082, "sampling/sampling_logp_difference/mean": 0.06426618993282318, "step": 525 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.1270783357322216, "epoch": 1.3842105263157896, "grad_norm": 0.015202553011476994, "learning_rate": 1e-06, "loss": 0.0751, "step": 526 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.13192099332809448, "epoch": 1.3868421052631579, "grad_norm": 0.02699258364737034, "learning_rate": 1e-06, "loss": 0.022, "step": 527 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.13195763528347015, "epoch": 1.3894736842105262, "grad_norm": 0.02506939321756363, "learning_rate": 1e-06, "loss": 0.0573, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5820.0, "completions/mean_length": 1409.9921875, "completions/mean_terminated_length": 705.6932373046875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.132181815803051, "epoch": 1.3921052631578947, "frac_reward_zero_std": 0.5, "grad_norm": 0.015631964430212975, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 171635146.0, "reward": 0.7929407954216003, "reward_std": 0.14727577567100525, "rewards/progression_diversity/mean": -0.007682529743760824, "rewards/progression_diversity/std": 0.043747011572122574, "rewards/symbolic_reward_accuracy/mean": 0.8671875, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.9239909052848816, "rewards/symbolic_reward_partial_score/std": 0.23265688121318817, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0227715969085693, "sampling/importance_sampling_ratio/min": 1.4806555270752142e-07, "sampling/sampling_logp_difference/max": 15.725610733032227, "sampling/sampling_logp_difference/mean": 0.06556607782840729, "step": 529 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.12692655250430107, "epoch": 1.3947368421052633, "grad_norm": 0.013282880187034607, "learning_rate": 1e-06, "loss": 0.0761, "step": 530 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.13208907842636108, "epoch": 1.3973684210526316, "grad_norm": 0.01377064548432827, "learning_rate": 1e-06, "loss": 0.0366, "step": 531 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.13234106451272964, "epoch": 1.4, "grad_norm": 0.020579254254698753, "learning_rate": 1e-06, "loss": 0.0348, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4966.0, "completions/mean_length": 1032.94921875, "completions/mean_terminated_length": 695.9002075195312, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.12633076682686806, "epoch": 1.4026315789473685, "frac_reward_zero_std": 0.53125, "grad_norm": 0.039995189756155014, "learning_rate": 1e-06, "loss": 0.101, "num_tokens": 172575824.0, "reward": 0.8291662931442261, "reward_std": 0.12353667616844177, "rewards/progression_diversity/mean": -0.003291813191026449, "rewards/progression_diversity/std": 0.029986631125211716, "rewards/symbolic_reward_accuracy/mean": 0.912109375, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.9469400644302368, "rewards/symbolic_reward_partial_score/std": 0.20057909190654755, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0211803913116455, "sampling/importance_sampling_ratio/min": 8.730064109840896e-06, "sampling/sampling_logp_difference/max": 11.648737907409668, "sampling/sampling_logp_difference/mean": 0.0656655952334404, "step": 533 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.13137900829315186, "epoch": 1.4052631578947368, "grad_norm": 0.018619263544678688, "learning_rate": 1e-06, "loss": -0.0028, "step": 534 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.1252904236316681, "epoch": 1.4078947368421053, "grad_norm": 0.015142420306801796, "learning_rate": 1e-06, "loss": 0.0256, "step": 535 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.13380999118089676, "epoch": 1.4105263157894736, "grad_norm": 0.0105243269354105, "learning_rate": 1e-06, "loss": 0.0065, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3757.0, "completions/mean_length": 1088.859375, "completions/mean_terminated_length": 658.87548828125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.13350340723991394, "epoch": 1.4131578947368422, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03917808085680008, "learning_rate": 1e-06, "loss": 0.0387, "num_tokens": 173525096.0, "reward": 0.8361979722976685, "reward_std": 0.11943801492452621, "rewards/progression_diversity/mean": -0.0032509397715330124, "rewards/progression_diversity/std": 0.02727237343788147, "rewards/symbolic_reward_accuracy/mean": 0.923828125, "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, "rewards/symbolic_reward_partial_score/mean": 0.9488931894302368, "rewards/symbolic_reward_partial_score/std": 0.2026350498199463, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0211467742919922, "sampling/importance_sampling_ratio/min": 5.162648449186236e-06, "sampling/sampling_logp_difference/max": 12.174060821533203, "sampling/sampling_logp_difference/mean": 0.0664573684334755, "step": 537 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.13789378851652145, "epoch": 1.4157894736842105, "grad_norm": 0.008778770454227924, "learning_rate": 1e-06, "loss": 0.0109, "step": 538 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.13081741333007812, "epoch": 1.418421052631579, "grad_norm": 0.019565429538488388, "learning_rate": 1e-06, "loss": 0.0334, "step": 539 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.12705697864294052, "epoch": 1.4210526315789473, "grad_norm": 0.020435446873307228, "learning_rate": 1e-06, "loss": 0.0642, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7683.0, "completions/mean_length": 1142.486328125, "completions/mean_terminated_length": 619.0404052734375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.12402678281068802, "epoch": 1.4236842105263157, "frac_reward_zero_std": 0.65625, "grad_norm": 0.022379858419299126, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 174504833.0, "reward": 0.8272143006324768, "reward_std": 0.06737488508224487, "rewards/progression_diversity/mean": -0.0031863353215157986, "rewards/progression_diversity/std": 0.028556160628795624, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9475911259651184, "rewards/symbolic_reward_partial_score/std": 0.2016286551952362, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0205636024475098, "sampling/importance_sampling_ratio/min": 2.388037658853444e-15, "sampling/sampling_logp_difference/max": 33.668304443359375, "sampling/sampling_logp_difference/mean": 0.06742183864116669, "step": 541 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.13175051659345627, "epoch": 1.4263157894736842, "grad_norm": 0.009576407261192799, "learning_rate": 1e-06, "loss": 0.0465, "step": 542 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.12570127844810486, "epoch": 1.4289473684210527, "grad_norm": 0.011829288676381111, "learning_rate": 1e-06, "loss": 0.0597, "step": 543 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.1273176595568657, "epoch": 1.431578947368421, "grad_norm": 0.01299221906810999, "learning_rate": 1e-06, "loss": 0.0145, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16202.0, "completions/mean_length": 961.04296875, "completions/mean_terminated_length": 622.4151611328125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.11435278132557869, "epoch": 1.4342105263157894, "frac_reward_zero_std": 0.5625, "grad_norm": 0.030455466359853745, "learning_rate": 1e-06, "loss": 0.0736, "num_tokens": 175375831.0, "reward": 0.8315600156784058, "reward_std": 0.10956034064292908, "rewards/progression_diversity/mean": -0.00318007729947567, "rewards/progression_diversity/std": 0.026876848191022873, "rewards/symbolic_reward_accuracy/mean": 0.9140625, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.9510090947151184, "rewards/symbolic_reward_partial_score/std": 0.19548769295215607, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0186996459960938, "sampling/importance_sampling_ratio/min": 6.467719504144043e-05, "sampling/sampling_logp_difference/max": 9.646101951599121, "sampling/sampling_logp_difference/mean": 0.06452836096286774, "step": 545 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.1248774528503418, "epoch": 1.436842105263158, "grad_norm": 0.030380593612790108, "learning_rate": 1e-06, "loss": 0.0411, "step": 546 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.11489719897508621, "epoch": 1.4394736842105262, "grad_norm": 0.013684880919754505, "learning_rate": 1e-06, "loss": 0.031, "step": 547 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.12723349034786224, "epoch": 1.4421052631578948, "grad_norm": 0.005004690028727055, "learning_rate": 1e-06, "loss": 0.0033, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7127.0, "completions/mean_length": 1883.12109375, "completions/mean_terminated_length": 720.6033325195312, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.11534935608506203, "epoch": 1.444736842105263, "frac_reward_zero_std": 0.375, "grad_norm": 0.0357719361782074, "learning_rate": 1e-06, "loss": 0.064, "num_tokens": 176770005.0, "reward": 0.7715877294540405, "reward_std": 0.17539432644844055, "rewards/progression_diversity/mean": -0.009200896136462688, "rewards/progression_diversity/std": 0.044748030602931976, "rewards/symbolic_reward_accuracy/mean": 0.849609375, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.8977864980697632, "rewards/symbolic_reward_partial_score/std": 0.2720419764518738, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.017478108406067, "sampling/importance_sampling_ratio/min": 2.1775213099317625e-05, "sampling/sampling_logp_difference/max": 10.73473834991455, "sampling/sampling_logp_difference/mean": 0.05594378709793091, "step": 549 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.11893630772829056, "epoch": 1.4473684210526316, "grad_norm": 0.020856492221355438, "learning_rate": 1e-06, "loss": 0.0626, "step": 550 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.10961251333355904, "epoch": 1.45, "grad_norm": 0.032535944133996964, "learning_rate": 1e-06, "loss": 0.1122, "step": 551 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.12273216247558594, "epoch": 1.4526315789473685, "grad_norm": 0.015543154440820217, "learning_rate": 1e-06, "loss": 0.0734, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3752.0, "completions/mean_length": 1222.4609375, "completions/mean_terminated_length": 638.1419677734375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.12245375290513039, "epoch": 1.4552631578947368, "frac_reward_zero_std": 0.65625, "grad_norm": 0.010475813411176205, "learning_rate": 1e-06, "loss": 0.0767, "num_tokens": 177793537.0, "reward": 0.8133152723312378, "reward_std": 0.0920829027891159, "rewards/progression_diversity/mean": -0.006364539731293917, "rewards/progression_diversity/std": 0.04115993529558182, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.9345703125, "rewards/symbolic_reward_partial_score/std": 0.2200278490781784, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0192129611968994, "sampling/importance_sampling_ratio/min": 5.405944246937577e-13, "sampling/sampling_logp_difference/max": 28.24610710144043, "sampling/sampling_logp_difference/mean": 0.06582315266132355, "step": 553 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.1257517747581005, "epoch": 1.4578947368421051, "grad_norm": 0.010606672614812851, "learning_rate": 1e-06, "loss": 0.0181, "step": 554 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.1262059360742569, "epoch": 1.4605263157894737, "grad_norm": 0.027410009875893593, "learning_rate": 1e-06, "loss": 0.05, "step": 555 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.12493745982646942, "epoch": 1.4631578947368422, "grad_norm": 0.011104092933237553, "learning_rate": 1e-06, "loss": 0.0094, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6435.0, "completions/mean_length": 829.318359375, "completions/mean_terminated_length": 644.8755493164062, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.1371982842683792, "epoch": 1.4657894736842105, "frac_reward_zero_std": 0.53125, "grad_norm": 0.05659002810716629, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 178611268.0, "reward": 0.8500331044197083, "reward_std": 0.11168558895587921, "rewards/progression_diversity/mean": -0.001577683025971055, "rewards/progression_diversity/std": 0.019814975559711456, "rewards/symbolic_reward_accuracy/mean": 0.9375, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.96240234375, "rewards/symbolic_reward_partial_score/std": 0.17325447499752045, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0206682682037354, "sampling/importance_sampling_ratio/min": 8.104348694359942e-07, "sampling/sampling_logp_difference/max": 14.025694847106934, "sampling/sampling_logp_difference/mean": 0.0689624696969986, "step": 557 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.13341526687145233, "epoch": 1.4684210526315788, "grad_norm": 0.010691308416426182, "learning_rate": 1e-06, "loss": -0.005, "step": 558 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.13226507604122162, "epoch": 1.4710526315789474, "grad_norm": 0.01808001846075058, "learning_rate": 1e-06, "loss": 0.0304, "step": 559 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.12875394523143768, "epoch": 1.4736842105263157, "grad_norm": 0.01617010124027729, "learning_rate": 1e-06, "loss": 0.0914, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5451.0, "completions/mean_length": 817.533203125, "completions/mean_terminated_length": 664.0177612304688, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.1301540583372116, "epoch": 1.4763157894736842, "frac_reward_zero_std": 0.53125, "grad_norm": 0.026065807789564133, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 179427317.0, "reward": 0.8277164697647095, "reward_std": 0.10951961576938629, "rewards/progression_diversity/mean": -0.0017917966470122337, "rewards/progression_diversity/std": 0.020995857194066048, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.9615885019302368, "rewards/symbolic_reward_partial_score/std": 0.14556662738323212, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0219957828521729, "sampling/importance_sampling_ratio/min": 3.670751175377518e-05, "sampling/sampling_logp_difference/max": 10.212529182434082, "sampling/sampling_logp_difference/mean": 0.0698399469256401, "step": 561 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.13334877789020538, "epoch": 1.4789473684210526, "grad_norm": 0.008394371718168259, "learning_rate": 1e-06, "loss": 0.0013, "step": 562 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.13843107223510742, "epoch": 1.481578947368421, "grad_norm": 0.04058850556612015, "learning_rate": 1e-06, "loss": 0.016, "step": 563 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.13118517398834229, "epoch": 1.4842105263157894, "grad_norm": 0.01614241674542427, "learning_rate": 1e-06, "loss": 0.0288, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6531.0, "completions/mean_length": 730.0703125, "completions/mean_terminated_length": 668.682373046875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.1351623460650444, "epoch": 1.486842105263158, "frac_reward_zero_std": 0.625, "grad_norm": 0.01471333485096693, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 180205241.0, "reward": 0.8588860034942627, "reward_std": 0.09307113289833069, "rewards/progression_diversity/mean": -7.880153134465218e-05, "rewards/progression_diversity/std": 0.0017830750439316034, "rewards/symbolic_reward_accuracy/mean": 0.9453125, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.9736328125, "rewards/symbolic_reward_partial_score/std": 0.13708151876926422, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0231963396072388, "sampling/importance_sampling_ratio/min": 1.8753072481558775e-06, "sampling/sampling_logp_difference/max": 13.186738014221191, "sampling/sampling_logp_difference/mean": 0.07400602847337723, "step": 565 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.13259483128786087, "epoch": 1.4894736842105263, "grad_norm": 0.011994445696473122, "learning_rate": 1e-06, "loss": 0.0206, "step": 566 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.13419703394174576, "epoch": 1.4921052631578946, "grad_norm": 0.009042926132678986, "learning_rate": 1e-06, "loss": 0.0123, "step": 567 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.13261277973651886, "epoch": 1.4947368421052631, "grad_norm": 0.013076446950435638, "learning_rate": 1e-06, "loss": 0.0181, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7868.0, "completions/mean_length": 976.060546875, "completions/mean_terminated_length": 700.3717651367188, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.13073333352804184, "epoch": 1.4973684210526317, "frac_reward_zero_std": 0.375, "grad_norm": 0.017082160338759422, "learning_rate": 1e-06, "loss": 0.0381, "num_tokens": 181120984.0, "reward": 0.8026704788208008, "reward_std": 0.17961543798446655, "rewards/progression_diversity/mean": -0.0015097158029675484, "rewards/progression_diversity/std": 0.018514791503548622, "rewards/symbolic_reward_accuracy/mean": 0.876953125, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.9275716543197632, "rewards/symbolic_reward_partial_score/std": 0.2205788642168045, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0213254690170288, "sampling/importance_sampling_ratio/min": 2.747718781392905e-07, "sampling/sampling_logp_difference/max": 15.107324600219727, "sampling/sampling_logp_difference/mean": 0.07009261846542358, "step": 569 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.1352020874619484, "epoch": 1.5, "grad_norm": 0.02589314803481102, "learning_rate": 1e-06, "loss": 0.0211, "step": 570 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.13306277990341187, "epoch": 1.5026315789473683, "grad_norm": 0.018376614898443222, "learning_rate": 1e-06, "loss": 0.014, "step": 571 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.1296408474445343, "epoch": 1.5052631578947369, "grad_norm": 0.028880199417471886, "learning_rate": 1e-06, "loss": 0.0695, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5029.0, "completions/mean_length": 1003.607421875, "completions/mean_terminated_length": 665.9141845703125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.1362362802028656, "epoch": 1.5078947368421054, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03721587732434273, "learning_rate": 1e-06, "loss": 0.0271, "num_tokens": 182032623.0, "reward": 0.8145319223403931, "reward_std": 0.13320812582969666, "rewards/progression_diversity/mean": -0.001894364831969142, "rewards/progression_diversity/std": 0.017769131809473038, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.9371744990348816, "rewards/symbolic_reward_partial_score/std": 0.2098037451505661, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0225517749786377, "sampling/importance_sampling_ratio/min": 1.7453003763891715e-15, "sampling/sampling_logp_difference/max": 33.981849670410156, "sampling/sampling_logp_difference/mean": 0.07218128442764282, "step": 573 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.1370982825756073, "epoch": 1.5105263157894737, "grad_norm": 0.01768220029771328, "learning_rate": 1e-06, "loss": 0.0052, "step": 574 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.1406766176223755, "epoch": 1.513157894736842, "grad_norm": 0.025351712480187416, "learning_rate": 1e-06, "loss": 0.0384, "step": 575 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.1350262686610222, "epoch": 1.5157894736842106, "grad_norm": 0.012894677929580212, "learning_rate": 1e-06, "loss": 0.0346, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6116.0, "completions/mean_length": 1377.828125, "completions/mean_terminated_length": 831.0445556640625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "entropy": 0.13445448130369186, "epoch": 1.518421052631579, "frac_reward_zero_std": 0.34375, "grad_norm": 0.034472037106752396, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 183164919.0, "reward": 0.7574737071990967, "reward_std": 0.18111687898635864, "rewards/progression_diversity/mean": -0.004586691036820412, "rewards/progression_diversity/std": 0.0347183421254158, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.8916015625, "rewards/symbolic_reward_partial_score/std": 0.2719910144805908, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0212225914001465, "sampling/importance_sampling_ratio/min": 3.2291410434481804e-08, "sampling/sampling_logp_difference/max": 17.248464584350586, "sampling/sampling_logp_difference/mean": 0.06880679726600647, "step": 577 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.1360417976975441, "epoch": 1.5210526315789474, "grad_norm": 0.01104611437767744, "learning_rate": 1e-06, "loss": 0.0152, "step": 578 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.12674961611628532, "epoch": 1.5236842105263158, "grad_norm": 0.045898765325546265, "learning_rate": 1e-06, "loss": 0.0459, "step": 579 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.125128373503685, "epoch": 1.526315789473684, "grad_norm": 0.015659412369132042, "learning_rate": 1e-06, "loss": 0.0575, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5905.0, "completions/mean_length": 1173.80859375, "completions/mean_terminated_length": 714.7484741210938, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.12383318692445755, "epoch": 1.5289473684210526, "frac_reward_zero_std": 0.53125, "grad_norm": 0.04507393762469292, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 184160693.0, "reward": 0.8106682300567627, "reward_std": 0.1318516582250595, "rewards/progression_diversity/mean": -0.0025172452442348003, "rewards/progression_diversity/std": 0.021089140325784683, "rewards/symbolic_reward_accuracy/mean": 0.890625, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.9308268427848816, "rewards/symbolic_reward_partial_score/std": 0.2209477424621582, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0210371017456055, "sampling/importance_sampling_ratio/min": 1.1693467028006665e-10, "sampling/sampling_logp_difference/max": 22.86940574645996, "sampling/sampling_logp_difference/mean": 0.07237155735492706, "step": 581 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.12782908231019974, "epoch": 1.5315789473684212, "grad_norm": 0.010001985356211662, "learning_rate": 1e-06, "loss": 0.0502, "step": 582 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.12940338253974915, "epoch": 1.5342105263157895, "grad_norm": 0.022164426743984222, "learning_rate": 1e-06, "loss": 0.0063, "step": 583 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.1259337067604065, "epoch": 1.5368421052631578, "grad_norm": 0.010080978274345398, "learning_rate": 1e-06, "loss": 0.0076, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5788.0, "completions/mean_length": 1110.978515625, "completions/mean_terminated_length": 744.426025390625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.11713157966732979, "epoch": 1.5394736842105263, "frac_reward_zero_std": 0.4375, "grad_norm": 0.034748658537864685, "learning_rate": 1e-06, "loss": 0.0692, "num_tokens": 185156842.0, "reward": 0.8088133931159973, "reward_std": 0.16672645509243011, "rewards/progression_diversity/mean": -0.0024531371891498566, "rewards/progression_diversity/std": 0.021702419966459274, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9337564706802368, "rewards/symbolic_reward_partial_score/std": 0.2161990851163864, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.019039511680603, "sampling/importance_sampling_ratio/min": 1.6557880826439941e-06, "sampling/sampling_logp_difference/max": 13.311233520507812, "sampling/sampling_logp_difference/mean": 0.06788427382707596, "step": 585 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.12097888439893723, "epoch": 1.5421052631578949, "grad_norm": 0.02093541994690895, "learning_rate": 1e-06, "loss": 0.0109, "step": 586 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.12494004517793655, "epoch": 1.5447368421052632, "grad_norm": 0.012646614573895931, "learning_rate": 1e-06, "loss": -0.0019, "step": 587 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.12453506141901016, "epoch": 1.5473684210526315, "grad_norm": 0.020611083135008812, "learning_rate": 1e-06, "loss": 0.0183, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4762.0, "completions/mean_length": 1052.017578125, "completions/mean_terminated_length": 715.38720703125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.1217985488474369, "epoch": 1.55, "frac_reward_zero_std": 0.40625, "grad_norm": 0.038163021206855774, "learning_rate": 1e-06, "loss": 0.0278, "num_tokens": 186078931.0, "reward": 0.8041250109672546, "reward_std": 0.16053986549377441, "rewards/progression_diversity/mean": -0.002542970236390829, "rewards/progression_diversity/std": 0.02396521344780922, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9220377206802368, "rewards/symbolic_reward_partial_score/std": 0.2436649203300476, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0195398330688477, "sampling/importance_sampling_ratio/min": 1.0385434734416776e-07, "sampling/sampling_logp_difference/max": 16.080276489257812, "sampling/sampling_logp_difference/mean": 0.07104349136352539, "step": 589 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.12670139968395233, "epoch": 1.5526315789473686, "grad_norm": 0.02337627112865448, "learning_rate": 1e-06, "loss": 0.0112, "step": 590 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.12126578390598297, "epoch": 1.555263157894737, "grad_norm": 0.02081996016204357, "learning_rate": 1e-06, "loss": 0.0416, "step": 591 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.121041189879179, "epoch": 1.5578947368421052, "grad_norm": 0.014259042218327522, "learning_rate": 1e-06, "loss": 0.0228, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6324.0, "completions/mean_length": 1310.251953125, "completions/mean_terminated_length": 761.006103515625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.11463157832622528, "epoch": 1.5605263157894735, "frac_reward_zero_std": 0.28125, "grad_norm": 0.03503033518791199, "learning_rate": 1e-06, "loss": 0.0623, "num_tokens": 187157940.0, "reward": 0.774095892906189, "reward_std": 0.19663885235786438, "rewards/progression_diversity/mean": -0.002528228797018528, "rewards/progression_diversity/std": 0.020892662927508354, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.9150390625, "rewards/symbolic_reward_partial_score/std": 0.22795869410037994, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0178115367889404, "sampling/importance_sampling_ratio/min": 1.8364110587754112e-07, "sampling/sampling_logp_difference/max": 15.510282516479492, "sampling/sampling_logp_difference/mean": 0.06724664568901062, "step": 593 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.11785297840833664, "epoch": 1.563157894736842, "grad_norm": 0.015631457790732384, "learning_rate": 1e-06, "loss": 0.0155, "step": 594 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.11847185716032982, "epoch": 1.5657894736842106, "grad_norm": 0.025584295392036438, "learning_rate": 1e-06, "loss": 0.034, "step": 595 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.12040146440267563, "epoch": 1.568421052631579, "grad_norm": 0.024837322533130646, "learning_rate": 1e-06, "loss": 0.0075, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7156.0, "completions/mean_length": 1281.55859375, "completions/mean_terminated_length": 762.888916015625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.11434166878461838, "epoch": 1.5710526315789473, "frac_reward_zero_std": 0.25, "grad_norm": 0.031757909804582596, "learning_rate": 1e-06, "loss": 0.0399, "num_tokens": 188220754.0, "reward": 0.7410222291946411, "reward_std": 0.2251667082309723, "rewards/progression_diversity/mean": -0.004226570948958397, "rewards/progression_diversity/std": 0.034974951297044754, "rewards/symbolic_reward_accuracy/mean": 0.8046875, "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, "rewards/symbolic_reward_partial_score/mean": 0.8719075322151184, "rewards/symbolic_reward_partial_score/std": 0.29337501525878906, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.018049955368042, "sampling/importance_sampling_ratio/min": 1.5532393717876403e-06, "sampling/sampling_logp_difference/max": 13.375167846679688, "sampling/sampling_logp_difference/mean": 0.06641073524951935, "step": 597 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.1158062033355236, "epoch": 1.5736842105263158, "grad_norm": 0.05890418589115143, "learning_rate": 1e-06, "loss": 0.0376, "step": 598 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.11098961159586906, "epoch": 1.5763157894736843, "grad_norm": 0.0232480950653553, "learning_rate": 1e-06, "loss": 0.042, "step": 599 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.11672716960310936, "epoch": 1.5789473684210527, "grad_norm": 0.019831934943795204, "learning_rate": 1e-06, "loss": 0.0268, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6496.0, "completions/mean_length": 1332.814453125, "completions/mean_terminated_length": 752.7484741210938, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.11477554962038994, "epoch": 1.581578947368421, "frac_reward_zero_std": 0.125, "grad_norm": 0.03628187254071236, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 189330323.0, "reward": 0.7476116418838501, "reward_std": 0.23416918516159058, "rewards/progression_diversity/mean": -0.004457623697817326, "rewards/progression_diversity/std": 0.032657936215400696, "rewards/symbolic_reward_accuracy/mean": 0.810546875, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.8834635019302368, "rewards/symbolic_reward_partial_score/std": 0.28327476978302, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0170378684997559, "sampling/importance_sampling_ratio/min": 2.704789210383751e-07, "sampling/sampling_logp_difference/max": 15.123071670532227, "sampling/sampling_logp_difference/mean": 0.06537644565105438, "step": 601 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.11666785180568695, "epoch": 1.5842105263157895, "grad_norm": 0.03475677967071533, "learning_rate": 1e-06, "loss": 0.0451, "step": 602 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.11766305565834045, "epoch": 1.586842105263158, "grad_norm": 0.016544286161661148, "learning_rate": 1e-06, "loss": 0.0143, "step": 603 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.11409174650907516, "epoch": 1.5894736842105264, "grad_norm": 0.04165460914373398, "learning_rate": 1e-06, "loss": 0.0841, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7985.0, "completions/mean_length": 1443.87890625, "completions/mean_terminated_length": 773.097900390625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.11511170864105225, "epoch": 1.5921052631578947, "frac_reward_zero_std": 0.28125, "grad_norm": 0.030341695994138718, "learning_rate": 1e-06, "loss": 0.0233, "num_tokens": 190460885.0, "reward": 0.7671371698379517, "reward_std": 0.20124712586402893, "rewards/progression_diversity/mean": -0.005033410154283047, "rewards/progression_diversity/std": 0.03523511067032814, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.8912760615348816, "rewards/symbolic_reward_partial_score/std": 0.28599029779434204, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0175724029541016, "sampling/importance_sampling_ratio/min": 5.44225827070477e-07, "sampling/sampling_logp_difference/max": 14.423901557922363, "sampling/sampling_logp_difference/mean": 0.06623612344264984, "step": 605 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.11644968762993813, "epoch": 1.594736842105263, "grad_norm": 0.019640203565359116, "learning_rate": 1e-06, "loss": 0.0203, "step": 606 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.11942460760474205, "epoch": 1.5973684210526315, "grad_norm": 0.018807774409651756, "learning_rate": 1e-06, "loss": 0.0451, "step": 607 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.11645659804344177, "epoch": 1.6, "grad_norm": 0.018924150615930557, "learning_rate": 1e-06, "loss": 0.044, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6067.0, "completions/mean_length": 829.654296875, "completions/mean_terminated_length": 737.9783935546875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.12432442978024483, "epoch": 1.6026315789473684, "frac_reward_zero_std": 0.4375, "grad_norm": 0.025178352370858192, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 191283460.0, "reward": 0.8110204339027405, "reward_std": 0.1572936773300171, "rewards/progression_diversity/mean": -0.001474375487305224, "rewards/progression_diversity/std": 0.02059447206556797, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9358724355697632, "rewards/symbolic_reward_partial_score/std": 0.20960327982902527, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0208200216293335, "sampling/importance_sampling_ratio/min": 8.66128682305932e-22, "sampling/sampling_logp_difference/max": 48.498008728027344, "sampling/sampling_logp_difference/mean": 0.07230201363563538, "step": 609 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.12939514964818954, "epoch": 1.6052631578947367, "grad_norm": 0.012045629322528839, "learning_rate": 1e-06, "loss": 0.0036, "step": 610 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.12374605610966682, "epoch": 1.6078947368421053, "grad_norm": 0.019549217075109482, "learning_rate": 1e-06, "loss": 0.0286, "step": 611 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.12776687741279602, "epoch": 1.6105263157894738, "grad_norm": 0.014186064712703228, "learning_rate": 1e-06, "loss": 0.0144, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5412.0, "completions/mean_length": 941.5625, "completions/mean_terminated_length": 758.4506225585938, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.12887892127037048, "epoch": 1.6131578947368421, "frac_reward_zero_std": 0.3125, "grad_norm": 0.04177960380911827, "learning_rate": 1e-06, "loss": 0.0415, "num_tokens": 192155012.0, "reward": 0.7932851314544678, "reward_std": 0.19080910086631775, "rewards/progression_diversity/mean": -0.0025479630567133427, "rewards/progression_diversity/std": 0.03208750858902931, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.90478515625, "rewards/symbolic_reward_partial_score/std": 0.272712379693985, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0208125114440918, "sampling/importance_sampling_ratio/min": 1.5786597487021936e-06, "sampling/sampling_logp_difference/max": 13.35893440246582, "sampling/sampling_logp_difference/mean": 0.07315555959939957, "step": 613 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.1350940614938736, "epoch": 1.6157894736842104, "grad_norm": 0.012729027308523655, "learning_rate": 1e-06, "loss": 0.0034, "step": 614 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.12994906306266785, "epoch": 1.618421052631579, "grad_norm": 0.02224569581449032, "learning_rate": 1e-06, "loss": 0.0451, "step": 615 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.13485563546419144, "epoch": 1.6210526315789475, "grad_norm": 0.023328816518187523, "learning_rate": 1e-06, "loss": 0.0112, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4567.0, "completions/mean_length": 1037.134765625, "completions/mean_terminated_length": 700.1776733398438, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.13107465207576752, "epoch": 1.6236842105263158, "frac_reward_zero_std": 0.28125, "grad_norm": 0.03819314390420914, "learning_rate": 1e-06, "loss": 0.0452, "num_tokens": 193093321.0, "reward": 0.790679931640625, "reward_std": 0.1968342363834381, "rewards/progression_diversity/mean": -0.004279204178601503, "rewards/progression_diversity/std": 0.03591744601726532, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.9202473759651184, "rewards/symbolic_reward_partial_score/std": 0.24266402423381805, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0211315155029297, "sampling/importance_sampling_ratio/min": 5.997556399961468e-06, "sampling/sampling_logp_difference/max": 12.024158477783203, "sampling/sampling_logp_difference/mean": 0.07138025015592575, "step": 617 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.13448263704776764, "epoch": 1.6263157894736842, "grad_norm": 0.0195352453738451, "learning_rate": 1e-06, "loss": 0.0212, "step": 618 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.12872284650802612, "epoch": 1.6289473684210525, "grad_norm": 0.035076647996902466, "learning_rate": 1e-06, "loss": 0.0472, "step": 619 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.12851262837648392, "epoch": 1.631578947368421, "grad_norm": 0.017521562054753304, "learning_rate": 1e-06, "loss": 0.0411, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5643.0, "completions/mean_length": 1071.30078125, "completions/mean_terminated_length": 766.2669677734375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.12900108844041824, "epoch": 1.6342105263157896, "frac_reward_zero_std": 0.34375, "grad_norm": 0.03199876844882965, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 194057987.0, "reward": 0.7672144174575806, "reward_std": 0.19012746214866638, "rewards/progression_diversity/mean": -0.002193653956055641, "rewards/progression_diversity/std": 0.022286431863904, "rewards/symbolic_reward_accuracy/mean": 0.830078125, "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, "rewards/symbolic_reward_partial_score/mean": 0.9031575918197632, "rewards/symbolic_reward_partial_score/std": 0.2527180016040802, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0220717191696167, "sampling/importance_sampling_ratio/min": 3.4926625480657947e-10, "sampling/sampling_logp_difference/max": 21.77518653869629, "sampling/sampling_logp_difference/mean": 0.0735945776104927, "step": 621 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.13501236587762833, "epoch": 1.6368421052631579, "grad_norm": 0.023030543699860573, "learning_rate": 1e-06, "loss": -0.0017, "step": 622 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.13038154691457748, "epoch": 1.6394736842105262, "grad_norm": 0.020995106548070908, "learning_rate": 1e-06, "loss": 0.0482, "step": 623 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.13032087683677673, "epoch": 1.6421052631578947, "grad_norm": 0.01794985868036747, "learning_rate": 1e-06, "loss": 0.0357, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4610.0, "completions/mean_length": 1164.958984375, "completions/mean_terminated_length": 578.4239501953125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.13446198403835297, "epoch": 1.6447368421052633, "frac_reward_zero_std": 0.375, "grad_norm": 0.02450123056769371, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 195026894.0, "reward": 0.8016812801361084, "reward_std": 0.14666664600372314, "rewards/progression_diversity/mean": -0.007652563974261284, "rewards/progression_diversity/std": 0.04430336132645607, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.9231771230697632, "rewards/symbolic_reward_partial_score/std": 0.2403518557548523, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.02314293384552, "sampling/importance_sampling_ratio/min": 3.1002757168607786e-05, "sampling/sampling_logp_difference/max": 10.381434440612793, "sampling/sampling_logp_difference/mean": 0.07610031962394714, "step": 625 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.13796798884868622, "epoch": 1.6473684210526316, "grad_norm": 0.03212736174464226, "learning_rate": 1e-06, "loss": 0.0522, "step": 626 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.13732363283634186, "epoch": 1.65, "grad_norm": 0.03413732722401619, "learning_rate": 1e-06, "loss": 0.0207, "step": 627 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.1372433453798294, "epoch": 1.6526315789473685, "grad_norm": 0.029417581856250763, "learning_rate": 1e-06, "loss": 0.0398, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5332.0, "completions/mean_length": 1251.904296875, "completions/mean_terminated_length": 636.7784423828125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.13081193715333939, "epoch": 1.655263157894737, "frac_reward_zero_std": 0.34375, "grad_norm": 0.03926420584321022, "learning_rate": 1e-06, "loss": 0.086, "num_tokens": 196044989.0, "reward": 0.7881196737289429, "reward_std": 0.17913874983787537, "rewards/progression_diversity/mean": -0.00639567244797945, "rewards/progression_diversity/std": 0.04198707640171051, "rewards/symbolic_reward_accuracy/mean": 0.865234375, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.9085286855697632, "rewards/symbolic_reward_partial_score/std": 0.2608047127723694, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.023348331451416, "sampling/importance_sampling_ratio/min": 1.0149918125534896e-05, "sampling/sampling_logp_difference/max": 11.498044967651367, "sampling/sampling_logp_difference/mean": 0.07550258934497833, "step": 629 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.138485848903656, "epoch": 1.6578947368421053, "grad_norm": 0.01309316884726286, "learning_rate": 1e-06, "loss": 0.0257, "step": 630 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.13896264880895615, "epoch": 1.6605263157894736, "grad_norm": 0.012009157799184322, "learning_rate": 1e-06, "loss": 0.0538, "step": 631 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.1417469009757042, "epoch": 1.663157894736842, "grad_norm": 0.024768110364675522, "learning_rate": 1e-06, "loss": 0.0391, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6066.0, "completions/mean_length": 731.333984375, "completions/mean_terminated_length": 639.07861328125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.14609570801258087, "epoch": 1.6657894736842105, "frac_reward_zero_std": 0.40625, "grad_norm": 0.017739422619342804, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 196802888.0, "reward": 0.8241581916809082, "reward_std": 0.1724630892276764, "rewards/progression_diversity/mean": -0.0011804921086877584, "rewards/progression_diversity/std": 0.019015712663531303, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.9327799677848816, "rewards/symbolic_reward_partial_score/std": 0.2337876856327057, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0246440172195435, "sampling/importance_sampling_ratio/min": 1.7091028894355986e-06, "sampling/sampling_logp_difference/max": 13.279541969299316, "sampling/sampling_logp_difference/mean": 0.08123795688152313, "step": 633 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.14178509265184402, "epoch": 1.668421052631579, "grad_norm": 0.027597403153777122, "learning_rate": 1e-06, "loss": 0.0282, "step": 634 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.14551638811826706, "epoch": 1.6710526315789473, "grad_norm": 0.006523482967168093, "learning_rate": 1e-06, "loss": 0.0247, "step": 635 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.1471918523311615, "epoch": 1.6736842105263157, "grad_norm": 0.019637571647763252, "learning_rate": 1e-06, "loss": 0.0191, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 711.6328125, "completions/mean_terminated_length": 588.2283325195312, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.15460152179002762, "epoch": 1.6763157894736842, "frac_reward_zero_std": 0.4375, "grad_norm": 0.016574524343013763, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 197560204.0, "reward": 0.855990469455719, "reward_std": 0.12720367312431335, "rewards/progression_diversity/mean": -0.0015454285312443972, "rewards/progression_diversity/std": 0.024757709354162216, "rewards/symbolic_reward_accuracy/mean": 0.943359375, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.9685872793197632, "rewards/symbolic_reward_partial_score/std": 0.14984627068042755, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0268505811691284, "sampling/importance_sampling_ratio/min": 1.079319758900965e-06, "sampling/sampling_logp_difference/max": 13.739179611206055, "sampling/sampling_logp_difference/mean": 0.08289426565170288, "step": 637 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.15232256054878235, "epoch": 1.6789473684210527, "grad_norm": 0.00705451937392354, "learning_rate": 1e-06, "loss": -0.0044, "step": 638 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.1456858143210411, "epoch": 1.681578947368421, "grad_norm": 0.025568673387169838, "learning_rate": 1e-06, "loss": 0.0668, "step": 639 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.1482095867395401, "epoch": 1.6842105263157894, "grad_norm": 0.02351662516593933, "learning_rate": 1e-06, "loss": 0.0198, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5108.0, "completions/mean_length": 814.15234375, "completions/mean_terminated_length": 629.5296630859375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.14524078369140625, "epoch": 1.686842105263158, "frac_reward_zero_std": 0.375, "grad_norm": 0.02617044374346733, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 198374842.0, "reward": 0.8252705335617065, "reward_std": 0.16813138127326965, "rewards/progression_diversity/mean": -0.0022450610995292664, "rewards/progression_diversity/std": 0.032739993184804916, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9488931894302368, "rewards/symbolic_reward_partial_score/std": 0.18802446126937866, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0245275497436523, "sampling/importance_sampling_ratio/min": 5.705417915891076e-09, "sampling/sampling_logp_difference/max": 18.981849670410156, "sampling/sampling_logp_difference/mean": 0.07951345294713974, "step": 641 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.1453712061047554, "epoch": 1.6894736842105265, "grad_norm": 0.030677122995257378, "learning_rate": 1e-06, "loss": 0.0483, "step": 642 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.14691882580518723, "epoch": 1.6921052631578948, "grad_norm": 0.009873943403363228, "learning_rate": 1e-06, "loss": -0.0022, "step": 643 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.14735910296440125, "epoch": 1.694736842105263, "grad_norm": 0.017524579539895058, "learning_rate": 1e-06, "loss": 0.0235, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4121.0, "completions/mean_length": 1686.984375, "completions/mean_terminated_length": 739.7754516601562, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.14734282344579697, "epoch": 1.6973684210526314, "frac_reward_zero_std": 0.34375, "grad_norm": 0.03577494993805885, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 199650066.0, "reward": 0.7072658538818359, "reward_std": 0.2004225254058838, "rewards/progression_diversity/mean": -0.010720351710915565, "rewards/progression_diversity/std": 0.056672148406505585, "rewards/symbolic_reward_accuracy/mean": 0.763671875, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.85009765625, "rewards/symbolic_reward_partial_score/std": 0.32125625014305115, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0221692323684692, "sampling/importance_sampling_ratio/min": 1.0171705753236893e-06, "sampling/sampling_logp_difference/max": 13.79848575592041, "sampling/sampling_logp_difference/mean": 0.06588836014270782, "step": 645 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.13180600851774216, "epoch": 1.7, "grad_norm": 0.016572780907154083, "learning_rate": 1e-06, "loss": 0.0562, "step": 646 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.13006508350372314, "epoch": 1.7026315789473685, "grad_norm": 0.024881578981876373, "learning_rate": 1e-06, "loss": 0.1149, "step": 647 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.13194820284843445, "epoch": 1.7052631578947368, "grad_norm": 0.017025984823703766, "learning_rate": 1e-06, "loss": 0.0827, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13227.0, "completions/mean_length": 966.30859375, "completions/mean_terminated_length": 721.5833740234375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.15003421157598495, "epoch": 1.7078947368421051, "frac_reward_zero_std": 0.34375, "grad_norm": 0.026492591947317123, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 200547280.0, "reward": 0.7986052632331848, "reward_std": 0.18857163190841675, "rewards/progression_diversity/mean": -0.0027552107349038124, "rewards/progression_diversity/std": 0.026247508823871613, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.9153645634651184, "rewards/symbolic_reward_partial_score/std": 0.25720417499542236, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.025015115737915, "sampling/importance_sampling_ratio/min": 3.4338822274548875e-07, "sampling/sampling_logp_difference/max": 14.884404182434082, "sampling/sampling_logp_difference/mean": 0.07899387180805206, "step": 649 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.15113919973373413, "epoch": 1.7105263157894737, "grad_norm": 0.029967421665787697, "learning_rate": 1e-06, "loss": 0.04, "step": 650 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.14088765531778336, "epoch": 1.7131578947368422, "grad_norm": 0.04313409700989723, "learning_rate": 1e-06, "loss": 0.0421, "step": 651 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.14788048714399338, "epoch": 1.7157894736842105, "grad_norm": 0.014722016640007496, "learning_rate": 1e-06, "loss": 0.0357, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4130.0, "completions/mean_length": 835.263671875, "completions/mean_terminated_length": 619.7366333007812, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.14688391238451004, "epoch": 1.7184210526315788, "frac_reward_zero_std": 0.25, "grad_norm": 0.025073617696762085, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 201358551.0, "reward": 0.7871145606040955, "reward_std": 0.2015751600265503, "rewards/progression_diversity/mean": -0.004367514047771692, "rewards/progression_diversity/std": 0.043641433119773865, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.92333984375, "rewards/symbolic_reward_partial_score/std": 0.23156411945819855, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0254812240600586, "sampling/importance_sampling_ratio/min": 2.4464074429436655e-12, "sampling/sampling_logp_difference/max": 26.736400604248047, "sampling/sampling_logp_difference/mean": 0.08250489830970764, "step": 653 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.14983681589365005, "epoch": 1.7210526315789474, "grad_norm": 0.03017502650618553, "learning_rate": 1e-06, "loss": 0.0162, "step": 654 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.14353607594966888, "epoch": 1.723684210526316, "grad_norm": 0.011741744354367256, "learning_rate": 1e-06, "loss": 0.0456, "step": 655 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.14645575731992722, "epoch": 1.7263157894736842, "grad_norm": 0.0143356928601861, "learning_rate": 1e-06, "loss": 0.0101, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4858.0, "completions/mean_length": 808.5078125, "completions/mean_terminated_length": 623.8182373046875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.14062481373548508, "epoch": 1.7289473684210526, "frac_reward_zero_std": 0.28125, "grad_norm": 0.03301079943776131, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 202161115.0, "reward": 0.7795208692550659, "reward_std": 0.21248871088027954, "rewards/progression_diversity/mean": -0.0020201844163239002, "rewards/progression_diversity/std": 0.022819824516773224, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.8953450322151184, "rewards/symbolic_reward_partial_score/std": 0.28262069821357727, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0237058401107788, "sampling/importance_sampling_ratio/min": 4.2048032611319286e-08, "sampling/sampling_logp_difference/max": 16.984453201293945, "sampling/sampling_logp_difference/mean": 0.07847477495670319, "step": 657 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.14044133573770523, "epoch": 1.731578947368421, "grad_norm": 0.013989591039717197, "learning_rate": 1e-06, "loss": 0.0237, "step": 658 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.1442088633775711, "epoch": 1.7342105263157894, "grad_norm": 0.014133965596556664, "learning_rate": 1e-06, "loss": 0.0343, "step": 659 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.14218034595251083, "epoch": 1.736842105263158, "grad_norm": 0.01604565605521202, "learning_rate": 1e-06, "loss": 0.0263, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7880.0, "completions/mean_length": 708.55078125, "completions/mean_terminated_length": 616.1611328125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.14530527591705322, "epoch": 1.7394736842105263, "frac_reward_zero_std": 0.28125, "grad_norm": 0.022363541647791862, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 202912181.0, "reward": 0.8098558187484741, "reward_std": 0.1898079812526703, "rewards/progression_diversity/mean": -0.0007470193086192012, "rewards/progression_diversity/std": 0.011871008202433586, "rewards/symbolic_reward_accuracy/mean": 0.888671875, "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, "rewards/symbolic_reward_partial_score/mean": 0.9241536855697632, "rewards/symbolic_reward_partial_score/std": 0.24054944515228271, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0253888368606567, "sampling/importance_sampling_ratio/min": 3.482146238020789e-10, "sampling/sampling_logp_difference/max": 21.778202056884766, "sampling/sampling_logp_difference/mean": 0.08183024823665619, "step": 661 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.1518562138080597, "epoch": 1.7421052631578946, "grad_norm": 0.020019719377160072, "learning_rate": 1e-06, "loss": 0.0076, "step": 662 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.14416959136724472, "epoch": 1.7447368421052631, "grad_norm": 0.04639910161495209, "learning_rate": 1e-06, "loss": 0.0314, "step": 663 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.1428115963935852, "epoch": 1.7473684210526317, "grad_norm": 0.011411590501666069, "learning_rate": 1e-06, "loss": 0.0255, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 8516.0, "completions/mean_length": 1431.109375, "completions/mean_terminated_length": 727.8036499023438, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.13438043743371964, "epoch": 1.75, "frac_reward_zero_std": 0.375, "grad_norm": 0.026513205841183662, "learning_rate": 1e-06, "loss": 0.0408, "num_tokens": 204042541.0, "reward": 0.6865507364273071, "reward_std": 0.19577746093273163, "rewards/progression_diversity/mean": -0.007034477777779102, "rewards/progression_diversity/std": 0.0463702417910099, "rewards/symbolic_reward_accuracy/mean": 0.732421875, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.8362630605697632, "rewards/symbolic_reward_partial_score/std": 0.3250221014022827, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.02474045753479, "sampling/importance_sampling_ratio/min": 1.1017795123480725e-17, "sampling/sampling_logp_difference/max": 39.047019958496094, "sampling/sampling_logp_difference/mean": 0.07735016196966171, "step": 665 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.13546551764011383, "epoch": 1.7526315789473683, "grad_norm": 0.0192271675914526, "learning_rate": 1e-06, "loss": 0.0348, "step": 666 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.13766096532344818, "epoch": 1.7552631578947369, "grad_norm": 0.017237944528460503, "learning_rate": 1e-06, "loss": 0.0368, "step": 667 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.13993556797504425, "epoch": 1.7578947368421054, "grad_norm": 0.02463732473552227, "learning_rate": 1e-06, "loss": 0.0289, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8065.0, "completions/mean_length": 977.833984375, "completions/mean_terminated_length": 670.9382934570312, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.14678215980529785, "epoch": 1.7605263157894737, "frac_reward_zero_std": 0.40625, "grad_norm": 0.023546183481812477, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 204941848.0, "reward": 0.7797601222991943, "reward_std": 0.18229469656944275, "rewards/progression_diversity/mean": -0.002507382072508335, "rewards/progression_diversity/std": 0.025364823639392853, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.9091796875, "rewards/symbolic_reward_partial_score/std": 0.2537998855113983, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0251445770263672, "sampling/importance_sampling_ratio/min": 1.523035741968215e-08, "sampling/sampling_logp_difference/max": 17.999975204467773, "sampling/sampling_logp_difference/mean": 0.0789584219455719, "step": 669 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.1442192792892456, "epoch": 1.763157894736842, "grad_norm": 0.020266570150852203, "learning_rate": 1e-06, "loss": 0.0654, "step": 670 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.14410430938005447, "epoch": 1.7657894736842106, "grad_norm": 0.00926806777715683, "learning_rate": 1e-06, "loss": 0.0179, "step": 671 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.14117338508367538, "epoch": 1.768421052631579, "grad_norm": 0.008366767317056656, "learning_rate": 1e-06, "loss": 0.0513, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3605.0, "completions/mean_length": 996.298828125, "completions/mean_terminated_length": 658.4451293945312, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.14415088295936584, "epoch": 1.7710526315789474, "frac_reward_zero_std": 0.5, "grad_norm": 0.023884868249297142, "learning_rate": 1e-06, "loss": 0.0597, "num_tokens": 205849745.0, "reward": 0.8204151391983032, "reward_std": 0.13469088077545166, "rewards/progression_diversity/mean": -0.0043905326165258884, "rewards/progression_diversity/std": 0.046648427844047546, "rewards/symbolic_reward_accuracy/mean": 0.8984375, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.9431966543197632, "rewards/symbolic_reward_partial_score/std": 0.2042941004037857, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0248339176177979, "sampling/importance_sampling_ratio/min": 3.4787309033390557e-08, "sampling/sampling_logp_difference/max": 17.174013137817383, "sampling/sampling_logp_difference/mean": 0.07455458492040634, "step": 673 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.13898008316755295, "epoch": 1.7736842105263158, "grad_norm": 0.029918354004621506, "learning_rate": 1e-06, "loss": 0.0328, "step": 674 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.14073319733142853, "epoch": 1.776315789473684, "grad_norm": 0.01222238689661026, "learning_rate": 1e-06, "loss": 0.0455, "step": 675 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.13992726802825928, "epoch": 1.7789473684210526, "grad_norm": 0.00996064767241478, "learning_rate": 1e-06, "loss": 0.046, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6681.0, "completions/mean_length": 1095.326171875, "completions/mean_terminated_length": 728.3980102539062, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.14075642824172974, "epoch": 1.7815789473684212, "frac_reward_zero_std": 0.21875, "grad_norm": 0.03422519192099571, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 206820696.0, "reward": 0.7328424453735352, "reward_std": 0.20037183165550232, "rewards/progression_diversity/mean": -0.0018944772891700268, "rewards/progression_diversity/std": 0.018759803846478462, "rewards/symbolic_reward_accuracy/mean": 0.783203125, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.8836262822151184, "rewards/symbolic_reward_partial_score/std": 0.27410218119621277, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0241965055465698, "sampling/importance_sampling_ratio/min": 8.016753927364562e-11, "sampling/sampling_logp_difference/max": 23.246902465820312, "sampling/sampling_logp_difference/mean": 0.07802345603704453, "step": 677 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.1405147835612297, "epoch": 1.7842105263157895, "grad_norm": 0.01229146309196949, "learning_rate": 1e-06, "loss": 0.0232, "step": 678 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.13932440429925919, "epoch": 1.7868421052631578, "grad_norm": 0.01690313220024109, "learning_rate": 1e-06, "loss": 0.0334, "step": 679 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.14134883880615234, "epoch": 1.7894736842105263, "grad_norm": 0.03306613117456436, "learning_rate": 1e-06, "loss": 0.0357, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14676.0, "completions/mean_length": 1426.642578125, "completions/mean_terminated_length": 755.0877075195312, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.13857001811265945, "epoch": 1.7921052631578949, "frac_reward_zero_std": 0.40625, "grad_norm": 0.03711333125829697, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 207951297.0, "reward": 0.7199710607528687, "reward_std": 0.15880054235458374, "rewards/progression_diversity/mean": -0.00485343299806118, "rewards/progression_diversity/std": 0.03532535955309868, "rewards/symbolic_reward_accuracy/mean": 0.775390625, "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, "rewards/symbolic_reward_partial_score/mean": 0.8616536855697632, "rewards/symbolic_reward_partial_score/std": 0.30703648924827576, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0234718322753906, "sampling/importance_sampling_ratio/min": 3.384801416927985e-08, "sampling/sampling_logp_difference/max": 17.201385498046875, "sampling/sampling_logp_difference/mean": 0.07443863153457642, "step": 681 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.13849762827157974, "epoch": 1.7947368421052632, "grad_norm": 0.017198480665683746, "learning_rate": 1e-06, "loss": 0.0603, "step": 682 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.13620536029338837, "epoch": 1.7973684210526315, "grad_norm": 0.024120958521962166, "learning_rate": 1e-06, "loss": 0.0262, "step": 683 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.13732768595218658, "epoch": 1.8, "grad_norm": 0.031364087015390396, "learning_rate": 1e-06, "loss": 0.0579, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5863.0, "completions/mean_length": 877.501953125, "completions/mean_terminated_length": 662.5604248046875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.147243432700634, "epoch": 1.8026315789473686, "frac_reward_zero_std": 0.5, "grad_norm": 0.018228838220238686, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 208811746.0, "reward": 0.7966721653938293, "reward_std": 0.1374010145664215, "rewards/progression_diversity/mean": -0.0007563849212601781, "rewards/progression_diversity/std": 0.012015795335173607, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.9166666269302368, "rewards/symbolic_reward_partial_score/std": 0.2510036528110504, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0271267890930176, "sampling/importance_sampling_ratio/min": 4.275814262655331e-07, "sampling/sampling_logp_difference/max": 14.665121078491211, "sampling/sampling_logp_difference/mean": 0.08050627261400223, "step": 685 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.14790859073400497, "epoch": 1.805263157894737, "grad_norm": 0.019763531163334846, "learning_rate": 1e-06, "loss": 0.0097, "step": 686 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.15091699361801147, "epoch": 1.8078947368421052, "grad_norm": 0.018503857776522636, "learning_rate": 1e-06, "loss": 0.0235, "step": 687 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.15277040749788284, "epoch": 1.8105263157894735, "grad_norm": 0.014150720089673996, "learning_rate": 1e-06, "loss": 0.0199, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9039.0, "completions/mean_length": 803.11328125, "completions/mean_terminated_length": 649.4556274414062, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.16078810393810272, "epoch": 1.813157894736842, "frac_reward_zero_std": 0.5, "grad_norm": 0.019453125074505806, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 209626588.0, "reward": 0.838794469833374, "reward_std": 0.13540925085544586, "rewards/progression_diversity/mean": -0.0023899937514215708, "rewards/progression_diversity/std": 0.026318540796637535, "rewards/symbolic_reward_accuracy/mean": 0.919921875, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.9581705927848816, "rewards/symbolic_reward_partial_score/std": 0.17204301059246063, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0289819240570068, "sampling/importance_sampling_ratio/min": 5.673148734786082e-06, "sampling/sampling_logp_difference/max": 12.079766273498535, "sampling/sampling_logp_difference/mean": 0.08512907475233078, "step": 689 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.15033067017793655, "epoch": 1.8157894736842106, "grad_norm": 0.053703587502241135, "learning_rate": 1e-06, "loss": 0.028, "step": 690 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.16108743101358414, "epoch": 1.818421052631579, "grad_norm": 0.012639776803553104, "learning_rate": 1e-06, "loss": 0.0456, "step": 691 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.1631702333688736, "epoch": 1.8210526315789473, "grad_norm": 0.009674533270299435, "learning_rate": 1e-06, "loss": 0.0075, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13431.0, "completions/mean_length": 1337.83984375, "completions/mean_terminated_length": 726.207275390625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.15106821060180664, "epoch": 1.8236842105263158, "frac_reward_zero_std": 0.4375, "grad_norm": 0.027334807440638542, "learning_rate": 1e-06, "loss": 0.0295, "num_tokens": 210713706.0, "reward": 0.7495685815811157, "reward_std": 0.16618216037750244, "rewards/progression_diversity/mean": -0.0040826634503901005, "rewards/progression_diversity/std": 0.031112631782889366, "rewards/symbolic_reward_accuracy/mean": 0.8203125, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.8671875, "rewards/symbolic_reward_partial_score/std": 0.31214237213134766, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0262157917022705, "sampling/importance_sampling_ratio/min": 1.5291009569651237e-09, "sampling/sampling_logp_difference/max": 20.298585891723633, "sampling/sampling_logp_difference/mean": 0.07850104570388794, "step": 693 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.15497365593910217, "epoch": 1.8263157894736843, "grad_norm": 0.018270188942551613, "learning_rate": 1e-06, "loss": 0.0111, "step": 694 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.1553620547056198, "epoch": 1.8289473684210527, "grad_norm": 0.009259329177439213, "learning_rate": 1e-06, "loss": 0.0377, "step": 695 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.14360713958740234, "epoch": 1.831578947368421, "grad_norm": 0.010323197580873966, "learning_rate": 1e-06, "loss": 0.0499, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 11742.0, "completions/mean_length": 1135.07421875, "completions/mean_terminated_length": 674.8450317382812, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.1573568657040596, "epoch": 1.8342105263157895, "frac_reward_zero_std": 0.28125, "grad_norm": 0.03801272064447403, "learning_rate": 1e-06, "loss": 0.0393, "num_tokens": 211690672.0, "reward": 0.7663776874542236, "reward_std": 0.2096462845802307, "rewards/progression_diversity/mean": -0.007742568850517273, "rewards/progression_diversity/std": 0.055101774632930756, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.88818359375, "rewards/symbolic_reward_partial_score/std": 0.2851507365703583, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0286474227905273, "sampling/importance_sampling_ratio/min": 1.8554568725903664e-07, "sampling/sampling_logp_difference/max": 15.499964714050293, "sampling/sampling_logp_difference/mean": 0.08280766010284424, "step": 697 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.15686368942260742, "epoch": 1.836842105263158, "grad_norm": 0.022794852033257484, "learning_rate": 1e-06, "loss": 0.0325, "step": 698 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.1601133495569229, "epoch": 1.8394736842105264, "grad_norm": 0.009969050996005535, "learning_rate": 1e-06, "loss": 0.0541, "step": 699 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.16635262221097946, "epoch": 1.8421052631578947, "grad_norm": 0.016408884897828102, "learning_rate": 1e-06, "loss": 0.0137, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7094.0, "completions/mean_length": 898.439453125, "completions/mean_terminated_length": 621.36181640625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.16016201674938202, "epoch": 1.844736842105263, "frac_reward_zero_std": 0.25, "grad_norm": 0.06322816014289856, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 212557809.0, "reward": 0.7900543212890625, "reward_std": 0.2123447060585022, "rewards/progression_diversity/mean": -0.003359610214829445, "rewards/progression_diversity/std": 0.036421939730644226, "rewards/symbolic_reward_accuracy/mean": 0.865234375, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.9064127206802368, "rewards/symbolic_reward_partial_score/std": 0.2696720063686371, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0269652605056763, "sampling/importance_sampling_ratio/min": 2.2314340597517912e-08, "sampling/sampling_logp_difference/max": 17.6180362701416, "sampling/sampling_logp_difference/mean": 0.0807926282286644, "step": 701 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.15802810341119766, "epoch": 1.8473684210526315, "grad_norm": 0.015205918811261654, "learning_rate": 1e-06, "loss": 0.0352, "step": 702 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.16626133769750595, "epoch": 1.85, "grad_norm": 0.01964223012328148, "learning_rate": 1e-06, "loss": 0.045, "step": 703 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.1609407141804695, "epoch": 1.8526315789473684, "grad_norm": 0.01399518083781004, "learning_rate": 1e-06, "loss": 0.0803, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11244.0, "completions/mean_length": 1886.1796875, "completions/mean_terminated_length": 657.5508422851562, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.15396300703287125, "epoch": 1.8552631578947367, "frac_reward_zero_std": 0.1875, "grad_norm": 0.04543311893939972, "learning_rate": 1e-06, "loss": 0.0924, "num_tokens": 213938029.0, "reward": 0.7416077852249146, "reward_std": 0.23779910802841187, "rewards/progression_diversity/mean": -0.014027466997504234, "rewards/progression_diversity/std": 0.06849034875631332, "rewards/symbolic_reward_accuracy/mean": 0.80859375, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.86376953125, "rewards/symbolic_reward_partial_score/std": 0.31700682640075684, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0203101634979248, "sampling/importance_sampling_ratio/min": 1.0977701947467722e-07, "sampling/sampling_logp_difference/max": 16.02481460571289, "sampling/sampling_logp_difference/mean": 0.06179344281554222, "step": 705 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.1514749750494957, "epoch": 1.8578947368421053, "grad_norm": 0.024540159851312637, "learning_rate": 1e-06, "loss": 0.0743, "step": 706 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.15695882588624954, "epoch": 1.8605263157894738, "grad_norm": 0.01950952038168907, "learning_rate": 1e-06, "loss": 0.0336, "step": 707 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.13617245852947235, "epoch": 1.8631578947368421, "grad_norm": 0.031919222325086594, "learning_rate": 1e-06, "loss": 0.1621, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.146484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5933.0, "completions/mean_length": 2955.310546875, "completions/mean_terminated_length": 650.6155395507812, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.1501893624663353, "epoch": 1.8657894736842104, "frac_reward_zero_std": 0.125, "grad_norm": 0.027009177953004837, "learning_rate": 1e-06, "loss": 0.1153, "num_tokens": 215862796.0, "reward": 0.6578521728515625, "reward_std": 0.2280564308166504, "rewards/progression_diversity/mean": -0.025331620126962662, "rewards/progression_diversity/std": 0.08841913938522339, "rewards/symbolic_reward_accuracy/mean": 0.7109375, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.8011067509651184, "rewards/symbolic_reward_partial_score/std": 0.3672308921813965, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0195657014846802, "sampling/importance_sampling_ratio/min": 1.3204642934638855e-10, "sampling/sampling_logp_difference/max": 22.747867584228516, "sampling/sampling_logp_difference/mean": 0.05873488262295723, "step": 709 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.14868657290935516, "epoch": 1.868421052631579, "grad_norm": 0.06350129097700119, "learning_rate": 1e-06, "loss": 0.1039, "step": 710 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.1268022656440735, "epoch": 1.8710526315789475, "grad_norm": 0.012040435336530209, "learning_rate": 1e-06, "loss": 0.2382, "step": 711 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.14944910258054733, "epoch": 1.8736842105263158, "grad_norm": 0.042405128479003906, "learning_rate": 1e-06, "loss": 0.0774, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14478.0, "completions/mean_length": 2155.14453125, "completions/mean_terminated_length": 615.22509765625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.14173594117164612, "epoch": 1.8763157894736842, "frac_reward_zero_std": 0.15625, "grad_norm": 0.10443775355815887, "learning_rate": 1e-06, "loss": 0.135, "num_tokens": 217349846.0, "reward": 0.722713828086853, "reward_std": 0.21990327537059784, "rewards/progression_diversity/mean": -0.028423257172107697, "rewards/progression_diversity/std": 0.10595274716615677, "rewards/symbolic_reward_accuracy/mean": 0.79296875, "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, "rewards/symbolic_reward_partial_score/mean": 0.8435872197151184, "rewards/symbolic_reward_partial_score/std": 0.33945873379707336, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0188918113708496, "sampling/importance_sampling_ratio/min": 1.1887545392497145e-09, "sampling/sampling_logp_difference/max": 20.55035972595215, "sampling/sampling_logp_difference/mean": 0.059576526284217834, "step": 713 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.1623135283589363, "epoch": 1.8789473684210525, "grad_norm": 0.030199255794286728, "learning_rate": 1e-06, "loss": 0.1038, "step": 714 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.1539328470826149, "epoch": 1.881578947368421, "grad_norm": 0.028396856039762497, "learning_rate": 1e-06, "loss": 0.0772, "step": 715 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.14560812711715698, "epoch": 1.8842105263157896, "grad_norm": 0.03335552662611008, "learning_rate": 1e-06, "loss": 0.1204, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13825.0, "completions/mean_length": 2713.19140625, "completions/mean_terminated_length": 654.889892578125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.1502734199166298, "epoch": 1.8868421052631579, "frac_reward_zero_std": 0.03125, "grad_norm": 0.0449819378554821, "learning_rate": 1e-06, "loss": 0.1002, "num_tokens": 219142648.0, "reward": 0.6932658553123474, "reward_std": 0.27170705795288086, "rewards/progression_diversity/mean": -0.033768799155950546, "rewards/progression_diversity/std": 0.11512268334627151, "rewards/symbolic_reward_accuracy/mean": 0.763671875, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.8133138418197632, "rewards/symbolic_reward_partial_score/std": 0.36641624569892883, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0132670402526855, "sampling/importance_sampling_ratio/min": 2.8605501180398087e-15, "sampling/sampling_logp_difference/max": 33.487762451171875, "sampling/sampling_logp_difference/mean": 0.046033672988414764, "step": 717 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.1434255838394165, "epoch": 1.8894736842105262, "grad_norm": 0.020671065896749496, "learning_rate": 1e-06, "loss": 0.1992, "step": 718 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.5, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.671875, "entropy": 0.12531127035617828, "epoch": 1.8921052631578947, "grad_norm": 0.022249706089496613, "learning_rate": 1e-06, "loss": 0.2442, "step": 719 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4453125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.15144315361976624, "epoch": 1.8947368421052633, "grad_norm": 0.03603983670473099, "learning_rate": 1e-06, "loss": 0.1141, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14424.0, "completions/mean_length": 2154.701171875, "completions/mean_terminated_length": 614.7337646484375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.1503245010972023, "epoch": 1.8973684210526316, "frac_reward_zero_std": 0.125, "grad_norm": 0.14981180429458618, "learning_rate": 1e-06, "loss": 0.2187, "num_tokens": 220655679.0, "reward": 0.7320119738578796, "reward_std": 0.22837099432945251, "rewards/progression_diversity/mean": -0.021462373435497284, "rewards/progression_diversity/std": 0.08626196533441544, "rewards/symbolic_reward_accuracy/mean": 0.802734375, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.8561197519302368, "rewards/symbolic_reward_partial_score/std": 0.3233526647090912, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.019932508468628, "sampling/importance_sampling_ratio/min": 4.195763745121206e-13, "sampling/sampling_logp_difference/max": 28.499530792236328, "sampling/sampling_logp_difference/mean": 0.06518127024173737, "step": 721 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.17081984877586365, "epoch": 1.9, "grad_norm": 0.023720135912299156, "learning_rate": 1e-06, "loss": 0.0536, "step": 722 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.16384224593639374, "epoch": 1.9026315789473685, "grad_norm": 0.034469299018383026, "learning_rate": 1e-06, "loss": 0.1537, "step": 723 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.16798613965511322, "epoch": 1.905263157894737, "grad_norm": 0.03507945314049721, "learning_rate": 1e-06, "loss": 0.0875, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.126953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16240.0, "completions/mean_length": 2634.119140625, "completions/mean_terminated_length": 634.6957397460938, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.15430939197540283, "epoch": 1.9078947368421053, "frac_reward_zero_std": 0.03125, "grad_norm": 0.09424492716789246, "learning_rate": 1e-06, "loss": 0.1251, "num_tokens": 222400316.0, "reward": 0.671546220779419, "reward_std": 0.3024430274963379, "rewards/progression_diversity/mean": -0.03776249662041664, "rewards/progression_diversity/std": 0.12152393907308578, "rewards/symbolic_reward_accuracy/mean": 0.732421875, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.79638671875, "rewards/symbolic_reward_partial_score/std": 0.3773951828479767, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0167055130004883, "sampling/importance_sampling_ratio/min": 6.6573236350974275e-15, "sampling/sampling_logp_difference/max": 32.64305877685547, "sampling/sampling_logp_difference/mean": 0.05589202791452408, "step": 725 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.14185352623462677, "epoch": 1.9105263157894736, "grad_norm": 0.06517547369003296, "learning_rate": 1e-06, "loss": 0.2122, "step": 726 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.15953659266233444, "epoch": 1.913157894736842, "grad_norm": 0.022468971088528633, "learning_rate": 1e-06, "loss": 0.1739, "step": 727 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4921875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.640625, "entropy": 0.1624743863940239, "epoch": 1.9157894736842105, "grad_norm": 0.009907384403049946, "learning_rate": 1e-06, "loss": 0.0998, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.166015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15558.0, "completions/mean_length": 3540.01953125, "completions/mean_terminated_length": 983.2552490234375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.1347077265381813, "epoch": 1.918421052631579, "frac_reward_zero_std": 0.09375, "grad_norm": 0.13737860321998596, "learning_rate": 1e-06, "loss": 0.2226, "num_tokens": 224636646.0, "reward": 0.5830076932907104, "reward_std": 0.30949866771698, "rewards/progression_diversity/mean": -0.048849739134311676, "rewards/progression_diversity/std": 0.1297674924135208, "rewards/symbolic_reward_accuracy/mean": 0.630859375, "rewards/symbolic_reward_accuracy/std": 0.4830440282821655, "rewards/symbolic_reward_partial_score/mean": 0.7255859375, "rewards/symbolic_reward_partial_score/std": 0.4099434018135071, "rewards/tag_count_reward/mean": -0.126953125, "rewards/tag_count_reward/std": 0.33324605226516724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0149319171905518, "sampling/importance_sampling_ratio/min": 4.369263012223404e-22, "sampling/sampling_logp_difference/max": 49.18227767944336, "sampling/sampling_logp_difference/mean": 0.04961749166250229, "step": 729 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.15040332078933716, "epoch": 1.9210526315789473, "grad_norm": 0.059476714581251144, "learning_rate": 1e-06, "loss": 0.1366, "step": 730 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.15057525038719177, "epoch": 1.9236842105263157, "grad_norm": 0.06425424665212631, "learning_rate": 1e-06, "loss": 0.2423, "step": 731 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.15043792128562927, "epoch": 1.9263157894736842, "grad_norm": 0.022221507504582405, "learning_rate": 1e-06, "loss": 0.2027, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.087890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14566.0, "completions/mean_length": 2165.96875, "completions/mean_terminated_length": 795.9229125976562, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.17250816524028778, "epoch": 1.9289473684210527, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7282646298408508, "learning_rate": 1e-06, "loss": 0.1429, "num_tokens": 226135254.0, "reward": 0.6728378534317017, "reward_std": 0.29266250133514404, "rewards/progression_diversity/mean": -0.030671168118715286, "rewards/progression_diversity/std": 0.11537665873765945, "rewards/symbolic_reward_accuracy/mean": 0.724609375, "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, "rewards/symbolic_reward_partial_score/mean": 0.8147786259651184, "rewards/symbolic_reward_partial_score/std": 0.35132884979248047, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0237343311309814, "sampling/importance_sampling_ratio/min": 3.73120688677813e-20, "sampling/sampling_logp_difference/max": 44.73497009277344, "sampling/sampling_logp_difference/mean": 0.07423095405101776, "step": 733 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.18838734924793243, "epoch": 1.931578947368421, "grad_norm": 0.019116874784231186, "learning_rate": 1e-06, "loss": 0.1143, "step": 734 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.17143803089857101, "epoch": 1.9342105263157894, "grad_norm": 0.1997663974761963, "learning_rate": 1e-06, "loss": 0.1573, "step": 735 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.1800050213932991, "epoch": 1.936842105263158, "grad_norm": 0.02649916335940361, "learning_rate": 1e-06, "loss": 0.0814, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15817.0, "completions/mean_length": 2609.44921875, "completions/mean_terminated_length": 1018.9237060546875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.1816265881061554, "epoch": 1.9394736842105265, "frac_reward_zero_std": 0.09375, "grad_norm": 0.5376126170158386, "learning_rate": 1e-06, "loss": 0.1134, "num_tokens": 227868924.0, "reward": 0.6405581831932068, "reward_std": 0.2928628921508789, "rewards/progression_diversity/mean": -0.04574853554368019, "rewards/progression_diversity/std": 0.13723419606685638, "rewards/symbolic_reward_accuracy/mean": 0.69921875, "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, "rewards/symbolic_reward_partial_score/mean": 0.7682291269302368, "rewards/symbolic_reward_partial_score/std": 0.39795729517936707, "rewards/tag_count_reward/mean": -0.08984375, "rewards/tag_count_reward/std": 0.2862374484539032, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0221675634384155, "sampling/importance_sampling_ratio/min": 5.437771195999072e-18, "sampling/sampling_logp_difference/max": 39.7531623840332, "sampling/sampling_logp_difference/mean": 0.06859034299850464, "step": 737 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.18071607500314713, "epoch": 1.9421052631578948, "grad_norm": 0.05956464633345604, "learning_rate": 1e-06, "loss": 0.1105, "step": 738 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.16678573191165924, "epoch": 1.944736842105263, "grad_norm": 0.16846124827861786, "learning_rate": 1e-06, "loss": 0.1718, "step": 739 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.17628037184476852, "epoch": 1.9473684210526314, "grad_norm": 0.04649341478943825, "learning_rate": 1e-06, "loss": 0.1492, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16126.0, "completions/mean_length": 2970.525390625, "completions/mean_terminated_length": 1454.219482421875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.14562518894672394, "epoch": 1.95, "frac_reward_zero_std": 0.03125, "grad_norm": 2.966646194458008, "learning_rate": 1e-06, "loss": 0.2426, "num_tokens": 229805993.0, "reward": 0.6170042157173157, "reward_std": 0.3738676607608795, "rewards/progression_diversity/mean": -0.07204228639602661, "rewards/progression_diversity/std": 0.18077191710472107, "rewards/symbolic_reward_accuracy/mean": 0.67578125, "rewards/symbolic_reward_accuracy/std": 0.4685399830341339, "rewards/symbolic_reward_partial_score/mean": 0.74072265625, "rewards/symbolic_reward_partial_score/std": 0.41318923234939575, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.013148307800293, "sampling/importance_sampling_ratio/min": 2.707424684788481e-31, "sampling/sampling_logp_difference/max": 70.38414001464844, "sampling/sampling_logp_difference/mean": 0.05534841865301132, "step": 741 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.17404460906982422, "epoch": 1.9526315789473685, "grad_norm": 0.013007045723497868, "learning_rate": 1e-06, "loss": 0.2101, "step": 742 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.390625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.16387271136045456, "epoch": 1.9552631578947368, "grad_norm": 0.044528309255838394, "learning_rate": 1e-06, "loss": 0.1907, "step": 743 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.5078125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.71875, "entropy": 0.18834584951400757, "epoch": 1.9578947368421051, "grad_norm": 0.012452390044927597, "learning_rate": 1e-06, "loss": 0.1549, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16311.0, "completions/mean_length": 3217.857421875, "completions/mean_terminated_length": 1697.5838623046875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.13727252185344696, "epoch": 1.9605263157894737, "frac_reward_zero_std": 0.0, "grad_norm": 1.4300470352172852, "learning_rate": 1e-06, "loss": 0.2051, "num_tokens": 231876032.0, "reward": 0.619334876537323, "reward_std": 0.37329334020614624, "rewards/progression_diversity/mean": -0.07823123037815094, "rewards/progression_diversity/std": 0.17627538740634918, "rewards/symbolic_reward_accuracy/mean": 0.681640625, "rewards/symbolic_reward_accuracy/std": 0.46629536151885986, "rewards/symbolic_reward_partial_score/mean": 0.7376302480697632, "rewards/symbolic_reward_partial_score/std": 0.4179321825504303, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.012220859527588, "sampling/importance_sampling_ratio/min": 5.182242701661106e-28, "sampling/sampling_logp_difference/max": 62.827144622802734, "sampling/sampling_logp_difference/mean": 0.056160710752010345, "step": 745 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.1606038585305214, "epoch": 1.9631578947368422, "grad_norm": 0.028117861598730087, "learning_rate": 1e-06, "loss": 0.2423, "step": 746 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.1709175780415535, "epoch": 1.9657894736842105, "grad_norm": 0.67425537109375, "learning_rate": 1e-06, "loss": 0.2054, "step": 747 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.15508843958377838, "epoch": 1.9684210526315788, "grad_norm": 0.06400001794099808, "learning_rate": 1e-06, "loss": 0.2487, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15746.0, "completions/mean_length": 2000.880859375, "completions/mean_terminated_length": 1293.5142822265625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.21451418101787567, "epoch": 1.9710526315789474, "frac_reward_zero_std": 0.0, "grad_norm": 5.9699931144714355, "learning_rate": 1e-06, "loss": 0.0579, "num_tokens": 233257411.0, "reward": 0.651739239692688, "reward_std": 0.3094671964645386, "rewards/progression_diversity/mean": -0.0653361827135086, "rewards/progression_diversity/std": 0.19041091203689575, "rewards/symbolic_reward_accuracy/mean": 0.7109375, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.76904296875, "rewards/symbolic_reward_partial_score/std": 0.39429956674575806, "rewards/tag_count_reward/mean": -0.048828125, "rewards/tag_count_reward/std": 0.2157193273305893, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.021796703338623, "sampling/importance_sampling_ratio/min": 8.134976418108564e-29, "sampling/sampling_logp_difference/max": 64.67879486083984, "sampling/sampling_logp_difference/mean": 0.07527394592761993, "step": 749 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.5078125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.640625, "entropy": 0.17407967895269394, "epoch": 1.973684210526316, "grad_norm": 0.060622621327638626, "learning_rate": 1e-06, "loss": 0.1803, "step": 750 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.5859375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.7265625, "entropy": 0.19498290121555328, "epoch": 1.9763157894736842, "grad_norm": 0.015352616086602211, "learning_rate": 1e-06, "loss": 0.181, "step": 751 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.6875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.78125, "entropy": 0.24272602796554565, "epoch": 1.9789473684210526, "grad_norm": 0.01805986650288105, "learning_rate": 1e-06, "loss": 0.0971, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14517.0, "completions/mean_length": 2323.900390625, "completions/mean_terminated_length": 1448.79052734375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.18802518397569656, "epoch": 1.981578947368421, "frac_reward_zero_std": 0.09375, "grad_norm": 1.341567039489746, "learning_rate": 1e-06, "loss": 0.1657, "num_tokens": 234866896.0, "reward": 0.6480075120925903, "reward_std": 0.29000890254974365, "rewards/progression_diversity/mean": -0.08694960176944733, "rewards/progression_diversity/std": 0.21665886044502258, "rewards/symbolic_reward_accuracy/mean": 0.70703125, "rewards/symbolic_reward_accuracy/std": 0.455569326877594, "rewards/symbolic_reward_partial_score/mean": 0.7762044072151184, "rewards/symbolic_reward_partial_score/std": 0.3934388756752014, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.02323579788208, "sampling/importance_sampling_ratio/min": 1.605486811297672e-22, "sampling/sampling_logp_difference/max": 50.18344497680664, "sampling/sampling_logp_difference/mean": 0.08397974073886871, "step": 753 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.21599113941192627, "epoch": 1.9842105263157894, "grad_norm": 0.01323725562542677, "learning_rate": 1e-06, "loss": 0.1011, "step": 754 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.22974644601345062, "epoch": 1.986842105263158, "grad_norm": 0.02560954913496971, "learning_rate": 1e-06, "loss": 0.1258, "step": 755 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.24319110810756683, "epoch": 1.9894736842105263, "grad_norm": 0.5963929891586304, "learning_rate": 1e-06, "loss": 0.1971, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14817.0, "completions/mean_length": 1997.1171875, "completions/mean_terminated_length": 1164.818115234375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.22843757271766663, "epoch": 1.9921052631578946, "frac_reward_zero_std": 0.09375, "grad_norm": 1.2587209939956665, "learning_rate": 1e-06, "loss": 0.0798, "num_tokens": 236294732.0, "reward": 0.694200873374939, "reward_std": 0.2883387506008148, "rewards/progression_diversity/mean": -0.07210341095924377, "rewards/progression_diversity/std": 0.2020467221736908, "rewards/symbolic_reward_accuracy/mean": 0.7578125, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.8216145634651184, "rewards/symbolic_reward_partial_score/std": 0.3572600483894348, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0223069190979004, "sampling/importance_sampling_ratio/min": 8.752450397611482e-24, "sampling/sampling_logp_difference/max": 53.092708587646484, "sampling/sampling_logp_difference/mean": 0.10721838474273682, "step": 757 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.244800366461277, "epoch": 1.9947368421052631, "grad_norm": 0.015508892014622688, "learning_rate": 1e-06, "loss": 0.1874, "step": 758 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.26591283082962036, "epoch": 1.9973684210526317, "grad_norm": 0.013314544223248959, "learning_rate": 1e-06, "loss": 0.1516, "step": 759 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.2783430367708206, "epoch": 2.0, "grad_norm": 0.01680191606283188, "learning_rate": 1e-06, "loss": 0.1542, "step": 760 }, { "epoch": 2.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.020263671875, "eval_completions/max_length": 15967.3125, "eval_completions/max_terminated_length": 12734.125, "eval_completions/mean_length": 1709.68359375, "eval_completions/mean_terminated_length": 1407.859390258789, "eval_completions/min_length": 175.84375, "eval_completions/min_terminated_length": 175.84375, "eval_entropy": 0.18563631316646934, "eval_frac_reward_zero_std": 0.09375, "eval_loss": 0.05905697122216225, "eval_num_tokens": 236294732.0, "eval_reward": 0.6300447043031454, "eval_reward_std": 0.32271731412038207, "eval_rewards/progression_diversity/mean": -0.08879435807466507, "eval_rewards/progression_diversity/std": 0.2320320950821042, "eval_rewards/symbolic_reward_accuracy/mean": 0.681396484375, "eval_rewards/symbolic_reward_accuracy/std": 0.45585051737725735, "eval_rewards/symbolic_reward_partial_score/mean": 0.7582194041460752, "eval_rewards/symbolic_reward_partial_score/std": 0.38103948533535004, "eval_rewards/tag_count_reward/mean": -0.0537109375, "eval_rewards/tag_count_reward/std": 0.22261275839991868, "eval_runtime": 3967.3544, "eval_samples_per_second": 0.063, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0347554087638855, "eval_sampling/importance_sampling_ratio/min": 1.7375132229474076e-18, "eval_sampling/sampling_logp_difference/max": 45.634475350379944, "eval_sampling/sampling_logp_difference/mean": 0.12127477861940861, "eval_steps_per_second": 0.001, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16118.0, "completions/mean_length": 1692.767578125, "completions/mean_terminated_length": 1340.1781005859375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.22220094501972198, "epoch": 2.0026315789473683, "frac_reward_zero_std": 0.1875, "grad_norm": 3.1163506507873535, "learning_rate": 1e-06, "loss": 0.0849, "num_tokens": 237554389.0, "reward": 0.6846272945404053, "reward_std": 0.28297704458236694, "rewards/progression_diversity/mean": -0.07242967188358307, "rewards/progression_diversity/std": 0.20485153794288635, "rewards/symbolic_reward_accuracy/mean": 0.744140625, "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, "rewards/symbolic_reward_partial_score/mean": 0.8125, "rewards/symbolic_reward_partial_score/std": 0.35843971371650696, "rewards/tag_count_reward/mean": -0.048828125, "rewards/tag_count_reward/std": 0.2157193273305893, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0238771438598633, "sampling/importance_sampling_ratio/min": 3.568013560231561e-26, "sampling/sampling_logp_difference/max": 58.5952033996582, "sampling/sampling_logp_difference/mean": 0.12895509600639343, "step": 761 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.2493431717157364, "epoch": 2.0052631578947366, "grad_norm": 0.010103636421263218, "learning_rate": 1e-06, "loss": 0.165, "step": 762 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3100139796733856, "epoch": 2.0078947368421054, "grad_norm": 0.04030955210328102, "learning_rate": 1e-06, "loss": 0.149, "step": 763 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4296875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.3300608843564987, "epoch": 2.0105263157894737, "grad_norm": 0.010126029141247272, "learning_rate": 1e-06, "loss": 0.0765, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14986.0, "completions/mean_length": 1680.546875, "completions/mean_terminated_length": 1447.1588134765625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.21983007341623306, "epoch": 2.013157894736842, "frac_reward_zero_std": 0.15625, "grad_norm": 3.134493589401245, "learning_rate": 1e-06, "loss": 0.1212, "num_tokens": 238821645.0, "reward": 0.6834567785263062, "reward_std": 0.27827733755111694, "rewards/progression_diversity/mean": -0.07717426866292953, "rewards/progression_diversity/std": 0.20578458905220032, "rewards/symbolic_reward_accuracy/mean": 0.7421875, "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, "rewards/symbolic_reward_partial_score/mean": 0.81396484375, "rewards/symbolic_reward_partial_score/std": 0.35770609974861145, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0259695053100586, "sampling/importance_sampling_ratio/min": 3.113477761470245e-26, "sampling/sampling_logp_difference/max": 58.73147201538086, "sampling/sampling_logp_difference/mean": 0.13548921048641205, "step": 765 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.390625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.3355230987071991, "epoch": 2.0157894736842104, "grad_norm": 0.014872807078063488, "learning_rate": 1e-06, "loss": 0.0679, "step": 766 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.5302972793579102, "epoch": 2.018421052631579, "grad_norm": 0.02062629908323288, "learning_rate": 1e-06, "loss": 0.0955, "step": 767 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.718237966299057, "epoch": 2.0210526315789474, "grad_norm": 0.02212103269994259, "learning_rate": 1e-06, "loss": 0.1666, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15111.0, "completions/mean_length": 1686.498046875, "completions/mean_terminated_length": 1242.911376953125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.23345400393009186, "epoch": 2.0236842105263158, "frac_reward_zero_std": 0.0625, "grad_norm": 0.634035050868988, "learning_rate": 1e-06, "loss": 0.074, "num_tokens": 240076428.0, "reward": 0.6818051934242249, "reward_std": 0.29768824577331543, "rewards/progression_diversity/mean": -0.07144007831811905, "rewards/progression_diversity/std": 0.19266043603420258, "rewards/symbolic_reward_accuracy/mean": 0.740234375, "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, "rewards/symbolic_reward_partial_score/mean": 0.8134765625, "rewards/symbolic_reward_partial_score/std": 0.3538402020931244, "rewards/tag_count_reward/mean": -0.056640625, "rewards/tag_count_reward/std": 0.23138070106506348, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0273222923278809, "sampling/importance_sampling_ratio/min": 1.1702893084804623e-27, "sampling/sampling_logp_difference/max": 62.01254653930664, "sampling/sampling_logp_difference/mean": 0.14426520466804504, "step": 769 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2532157674431801, "epoch": 2.026315789473684, "grad_norm": 0.07092837989330292, "learning_rate": 1e-06, "loss": 0.1038, "step": 770 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3199596703052521, "epoch": 2.028947368421053, "grad_norm": 0.031068623065948486, "learning_rate": 1e-06, "loss": 0.0876, "step": 771 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3354818820953369, "epoch": 2.031578947368421, "grad_norm": 0.03109137900173664, "learning_rate": 1e-06, "loss": 0.1269, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16351.0, "completions/mean_length": 2114.2578125, "completions/mean_terminated_length": 1624.1859130859375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.2218097448348999, "epoch": 2.0342105263157895, "frac_reward_zero_std": 0.0625, "grad_norm": 11.015893936157227, "learning_rate": 1e-06, "loss": 0.1472, "num_tokens": 241569232.0, "reward": 0.6609280109405518, "reward_std": 0.3157079815864563, "rewards/progression_diversity/mean": -0.09372584521770477, "rewards/progression_diversity/std": 0.21451076865196228, "rewards/symbolic_reward_accuracy/mean": 0.716796875, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.7921549081802368, "rewards/symbolic_reward_partial_score/std": 0.37911295890808105, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.011353611946106, "sampling/importance_sampling_ratio/min": 4.881230540264573e-27, "sampling/sampling_logp_difference/max": 60.58440017700195, "sampling/sampling_logp_difference/mean": 0.21283593773841858, "step": 773 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.484375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6640625, "entropy": 0.29898831248283386, "epoch": 2.036842105263158, "grad_norm": 0.006906085181981325, "learning_rate": 1e-06, "loss": 0.1731, "step": 774 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.5546875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6953125, "entropy": 0.378355011343956, "epoch": 2.039473684210526, "grad_norm": 0.26340046525001526, "learning_rate": 1e-06, "loss": 0.1015, "step": 775 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.546875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.75, "entropy": 0.5297541320323944, "epoch": 2.042105263157895, "grad_norm": 0.008265807293355465, "learning_rate": 1e-06, "loss": 0.1864, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13564.0, "completions/mean_length": 1399.826171875, "completions/mean_terminated_length": 1101.336669921875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.23068619519472122, "epoch": 2.044736842105263, "frac_reward_zero_std": 0.1875, "grad_norm": 4.750518321990967, "learning_rate": 1e-06, "loss": 0.1014, "num_tokens": 242665047.0, "reward": 0.7356716394424438, "reward_std": 0.2675522565841675, "rewards/progression_diversity/mean": -0.05099809169769287, "rewards/progression_diversity/std": 0.1577788144350052, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.8575845956802368, "rewards/symbolic_reward_partial_score/std": 0.3261964023113251, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.020838737487793, "sampling/importance_sampling_ratio/min": 2.984163448692399e-29, "sampling/sampling_logp_difference/max": 65.68164825439453, "sampling/sampling_logp_difference/mean": 0.23801082372665405, "step": 777 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.27487921714782715, "epoch": 2.0473684210526315, "grad_norm": 0.010955928824841976, "learning_rate": 1e-06, "loss": 0.1502, "step": 778 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3157334327697754, "epoch": 2.05, "grad_norm": 0.04367856681346893, "learning_rate": 1e-06, "loss": 0.0794, "step": 779 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.31336382031440735, "epoch": 2.0526315789473686, "grad_norm": 0.021015914157032967, "learning_rate": 1e-06, "loss": 0.0561, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15221.0, "completions/mean_length": 1509.474609375, "completions/mean_terminated_length": 1152.486083984375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.22252874076366425, "epoch": 2.055263157894737, "frac_reward_zero_std": 0.09375, "grad_norm": 1.6408276557922363, "learning_rate": 1e-06, "loss": 0.1134, "num_tokens": 243863402.0, "reward": 0.7193964123725891, "reward_std": 0.28412267565727234, "rewards/progression_diversity/mean": -0.037899717688560486, "rewards/progression_diversity/std": 0.12527695298194885, "rewards/symbolic_reward_accuracy/mean": 0.78515625, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.8458659052848816, "rewards/symbolic_reward_partial_score/std": 0.32760530710220337, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0266220569610596, "sampling/importance_sampling_ratio/min": 4.3695186577606616e-27, "sampling/sampling_logp_difference/max": 60.69514465332031, "sampling/sampling_logp_difference/mean": 0.12954550981521606, "step": 781 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.22228249162435532, "epoch": 2.057894736842105, "grad_norm": 0.19859381020069122, "learning_rate": 1e-06, "loss": 0.131, "step": 782 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.20911472290754318, "epoch": 2.0605263157894735, "grad_norm": 0.02292071282863617, "learning_rate": 1e-06, "loss": 0.1517, "step": 783 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2361936792731285, "epoch": 2.0631578947368423, "grad_norm": 0.015427610836923122, "learning_rate": 1e-06, "loss": 0.0483, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15893.0, "completions/mean_length": 1741.46875, "completions/mean_terminated_length": 1238.593994140625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.21045714616775513, "epoch": 2.0657894736842106, "frac_reward_zero_std": 0.125, "grad_norm": 4.345335006713867, "learning_rate": 1e-06, "loss": 0.1222, "num_tokens": 245182202.0, "reward": 0.7289832830429077, "reward_std": 0.29325735569000244, "rewards/progression_diversity/mean": -0.046007223427295685, "rewards/progression_diversity/std": 0.139408677816391, "rewards/symbolic_reward_accuracy/mean": 0.802734375, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.8435872793197632, "rewards/symbolic_reward_partial_score/std": 0.34268614649772644, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0198302268981934, "sampling/importance_sampling_ratio/min": 9.978149817821409e-34, "sampling/sampling_logp_difference/max": 75.98749542236328, "sampling/sampling_logp_difference/mean": 0.21147558093070984, "step": 785 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2157856523990631, "epoch": 2.068421052631579, "grad_norm": 0.01274153497070074, "learning_rate": 1e-06, "loss": 0.2087, "step": 786 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2510286718606949, "epoch": 2.0710526315789473, "grad_norm": 0.06488237529993057, "learning_rate": 1e-06, "loss": 0.1405, "step": 787 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2752929925918579, "epoch": 2.0736842105263156, "grad_norm": 0.018524806946516037, "learning_rate": 1e-06, "loss": 0.1161, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14714.0, "completions/mean_length": 1852.96875, "completions/mean_terminated_length": 1262.2763671875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.22035640478134155, "epoch": 2.0763157894736843, "frac_reward_zero_std": 0.1875, "grad_norm": 3.2875747680664062, "learning_rate": 1e-06, "loss": 0.1068, "num_tokens": 246517706.0, "reward": 0.6830227971076965, "reward_std": 0.23248738050460815, "rewards/progression_diversity/mean": -0.04245023429393768, "rewards/progression_diversity/std": 0.12012242525815964, "rewards/symbolic_reward_accuracy/mean": 0.740234375, "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, "rewards/symbolic_reward_partial_score/mean": 0.8165690302848816, "rewards/symbolic_reward_partial_score/std": 0.35112887620925903, "rewards/tag_count_reward/mean": -0.056640625, "rewards/tag_count_reward/std": 0.23138070106506348, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.023137092590332, "sampling/importance_sampling_ratio/min": 1.018668067894736e-33, "sampling/sampling_logp_difference/max": 75.96681213378906, "sampling/sampling_logp_difference/mean": 0.3000330924987793, "step": 789 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.453125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.23726185411214828, "epoch": 2.0789473684210527, "grad_norm": 0.875003457069397, "learning_rate": 1e-06, "loss": 0.1011, "step": 790 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.3143727779388428, "epoch": 2.081578947368421, "grad_norm": 0.024129139259457588, "learning_rate": 1e-06, "loss": 0.1284, "step": 791 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4921875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6015625, "entropy": 0.2929740250110626, "epoch": 2.0842105263157893, "grad_norm": 0.013345804065465927, "learning_rate": 1e-06, "loss": 0.0514, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15700.0, "completions/mean_length": 1659.0625, "completions/mean_terminated_length": 903.1622314453125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.20717183500528336, "epoch": 2.086842105263158, "frac_reward_zero_std": 0.21875, "grad_norm": 6.014965057373047, "learning_rate": 1e-06, "loss": 0.1198, "num_tokens": 247780298.0, "reward": 0.7235679626464844, "reward_std": 0.24241438508033752, "rewards/progression_diversity/mean": -0.030903812497854233, "rewards/progression_diversity/std": 0.11096394807100296, "rewards/symbolic_reward_accuracy/mean": 0.7890625, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.85693359375, "rewards/symbolic_reward_partial_score/std": 0.3159598112106323, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0264015197753906, "sampling/importance_sampling_ratio/min": 1.2161178811991868e-37, "sampling/sampling_logp_difference/max": 84.99998474121094, "sampling/sampling_logp_difference/mean": 0.20792850852012634, "step": 793 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.20646440982818604, "epoch": 2.0894736842105264, "grad_norm": 0.08151978254318237, "learning_rate": 1e-06, "loss": 0.1191, "step": 794 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.22666236013174057, "epoch": 2.0921052631578947, "grad_norm": 0.016352776437997818, "learning_rate": 1e-06, "loss": 0.1051, "step": 795 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.23344239592552185, "epoch": 2.094736842105263, "grad_norm": 0.0139634869992733, "learning_rate": 1e-06, "loss": 0.1076, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16007.0, "completions/mean_length": 1441.34375, "completions/mean_terminated_length": 959.3225708007812, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.23309873044490814, "epoch": 2.0973684210526318, "frac_reward_zero_std": 0.3125, "grad_norm": 4.238341808319092, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 248903546.0, "reward": 0.7395845651626587, "reward_std": 0.195578932762146, "rewards/progression_diversity/mean": -0.030801229178905487, "rewards/progression_diversity/std": 0.1092456728219986, "rewards/symbolic_reward_accuracy/mean": 0.802734375, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.8758137822151184, "rewards/symbolic_reward_partial_score/std": 0.29930591583251953, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0348535776138306, "sampling/importance_sampling_ratio/min": 7.160635152699815e-43, "sampling/sampling_logp_difference/max": 97.04222106933594, "sampling/sampling_logp_difference/mean": 0.22585123777389526, "step": 797 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.22810588777065277, "epoch": 2.1, "grad_norm": 0.013128053396940231, "learning_rate": 1e-06, "loss": 0.0723, "step": 798 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.24390096962451935, "epoch": 2.1026315789473684, "grad_norm": 0.007489512674510479, "learning_rate": 1e-06, "loss": 0.1571, "step": 799 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.22683186829090118, "epoch": 2.1052631578947367, "grad_norm": 0.012858620844781399, "learning_rate": 1e-06, "loss": 0.128, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13867.0, "completions/mean_length": 1304.953125, "completions/mean_terminated_length": 849.85107421875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.2136317864060402, "epoch": 2.1078947368421055, "frac_reward_zero_std": 0.1875, "grad_norm": 9.192523002624512, "learning_rate": 1e-06, "loss": 0.1348, "num_tokens": 249969474.0, "reward": 0.7585318684577942, "reward_std": 0.23733538389205933, "rewards/progression_diversity/mean": -0.030604764819145203, "rewards/progression_diversity/std": 0.1174231469631195, "rewards/symbolic_reward_accuracy/mean": 0.82421875, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.8914388418197632, "rewards/symbolic_reward_partial_score/std": 0.2718009948730469, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0298779010772705, "sampling/importance_sampling_ratio/min": 2.0962023727834939e-41, "sampling/sampling_logp_difference/max": 93.6658706665039, "sampling/sampling_logp_difference/mean": 0.1556258201599121, "step": 801 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.20574460923671722, "epoch": 2.110526315789474, "grad_norm": 0.00943561177700758, "learning_rate": 1e-06, "loss": 0.1348, "step": 802 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.22100205719470978, "epoch": 2.113157894736842, "grad_norm": 0.018543722108006477, "learning_rate": 1e-06, "loss": 0.0386, "step": 803 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23784782737493515, "epoch": 2.1157894736842104, "grad_norm": 0.1003841683268547, "learning_rate": 1e-06, "loss": 0.0778, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13121.0, "completions/mean_length": 1769.97265625, "completions/mean_terminated_length": 860.3859252929688, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.22240455448627472, "epoch": 2.1184210526315788, "frac_reward_zero_std": 0.125, "grad_norm": 10.562141418457031, "learning_rate": 1e-06, "loss": 0.1285, "num_tokens": 251296532.0, "reward": 0.736179769039154, "reward_std": 0.24343962967395782, "rewards/progression_diversity/mean": -0.03436863049864769, "rewards/progression_diversity/std": 0.12254931777715683, "rewards/symbolic_reward_accuracy/mean": 0.8046875, "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, "rewards/symbolic_reward_partial_score/mean": 0.8671875, "rewards/symbolic_reward_partial_score/std": 0.31175029277801514, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0263206958770752, "sampling/importance_sampling_ratio/min": 1.7586295727276454e-42, "sampling/sampling_logp_difference/max": 96.14400482177734, "sampling/sampling_logp_difference/mean": 0.2061021327972412, "step": 805 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.215851292014122, "epoch": 2.1210526315789475, "grad_norm": 0.012161717750132084, "learning_rate": 1e-06, "loss": 0.1275, "step": 806 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.20706567913293839, "epoch": 2.123684210526316, "grad_norm": 0.007896292954683304, "learning_rate": 1e-06, "loss": 0.1679, "step": 807 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.390625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.21126502752304077, "epoch": 2.126315789473684, "grad_norm": 0.007001427933573723, "learning_rate": 1e-06, "loss": 0.1365, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11642.0, "completions/mean_length": 1819.16796875, "completions/mean_terminated_length": 848.17919921875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.20564715564250946, "epoch": 2.1289473684210525, "frac_reward_zero_std": 0.25, "grad_norm": 6.921549320220947, "learning_rate": 1e-06, "loss": 0.1125, "num_tokens": 252653450.0, "reward": 0.7197491526603699, "reward_std": 0.20420284569263458, "rewards/progression_diversity/mean": -0.04169023782014847, "rewards/progression_diversity/std": 0.13636480271816254, "rewards/symbolic_reward_accuracy/mean": 0.779296875, "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, "rewards/symbolic_reward_partial_score/mean": 0.8673502206802368, "rewards/symbolic_reward_partial_score/std": 0.3025065064430237, "rewards/tag_count_reward/mean": -0.076171875, "rewards/tag_count_reward/std": 0.26553234457969666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0277680158615112, "sampling/importance_sampling_ratio/min": 5.605193857299268e-45, "sampling/sampling_logp_difference/max": 102.0, "sampling/sampling_logp_difference/mean": 0.18684858083724976, "step": 809 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.20571620762348175, "epoch": 2.1315789473684212, "grad_norm": 0.033720117062330246, "learning_rate": 1e-06, "loss": 0.1593, "step": 810 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.22204747796058655, "epoch": 2.1342105263157896, "grad_norm": 0.20377802848815918, "learning_rate": 1e-06, "loss": 0.1086, "step": 811 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.20914125442504883, "epoch": 2.136842105263158, "grad_norm": 0.025145720690488815, "learning_rate": 1e-06, "loss": 0.1232, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15292.0, "completions/mean_length": 1469.455078125, "completions/mean_terminated_length": 988.3406982421875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.21566122770309448, "epoch": 2.139473684210526, "frac_reward_zero_std": 0.21875, "grad_norm": 1.5204887390136719, "learning_rate": 1e-06, "loss": 0.0784, "num_tokens": 253832819.0, "reward": 0.7044533491134644, "reward_std": 0.23354381322860718, "rewards/progression_diversity/mean": -0.033183254301548004, "rewards/progression_diversity/std": 0.1218782439827919, "rewards/symbolic_reward_accuracy/mean": 0.75, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.8701171875, "rewards/symbolic_reward_partial_score/std": 0.28412559628486633, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0307013988494873, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 105.5, "sampling/sampling_logp_difference/mean": 0.2200557291507721, "step": 813 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2126343920826912, "epoch": 2.1421052631578945, "grad_norm": 0.019540801644325256, "learning_rate": 1e-06, "loss": 0.1643, "step": 814 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2253113090991974, "epoch": 2.1447368421052633, "grad_norm": 0.034518882632255554, "learning_rate": 1e-06, "loss": 0.0848, "step": 815 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.21958737820386887, "epoch": 2.1473684210526316, "grad_norm": 0.022237155586481094, "learning_rate": 1e-06, "loss": 0.0616, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16089.0, "completions/mean_length": 1241.01171875, "completions/mean_terminated_length": 1091.672607421875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2650395929813385, "epoch": 2.15, "frac_reward_zero_std": 0.25, "grad_norm": 8.908076286315918, "learning_rate": 1e-06, "loss": 0.0686, "num_tokens": 254887545.0, "reward": 0.7385146617889404, "reward_std": 0.21202999353408813, "rewards/progression_diversity/mean": -0.04013665392994881, "rewards/progression_diversity/std": 0.136456698179245, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.8680013418197632, "rewards/symbolic_reward_partial_score/std": 0.3045817017555237, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0395100116729736, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 112.35920715332031, "sampling/sampling_logp_difference/mean": 0.4265938699245453, "step": 817 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2493005096912384, "epoch": 2.1526315789473682, "grad_norm": 0.020062845200300217, "learning_rate": 1e-06, "loss": 0.1432, "step": 818 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23568686097860336, "epoch": 2.155263157894737, "grad_norm": 0.028896203264594078, "learning_rate": 1e-06, "loss": 0.1101, "step": 819 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.24035780876874924, "epoch": 2.1578947368421053, "grad_norm": 0.01008315198123455, "learning_rate": 1e-06, "loss": 0.0286, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12528.0, "completions/mean_length": 1109.306640625, "completions/mean_terminated_length": 958.6686401367188, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3168548345565796, "epoch": 2.1605263157894736, "frac_reward_zero_std": 0.28125, "grad_norm": 7.0420708656311035, "learning_rate": 1e-06, "loss": 0.079, "num_tokens": 255865494.0, "reward": 0.7541958093643188, "reward_std": 0.2114800214767456, "rewards/progression_diversity/mean": -0.03452695906162262, "rewards/progression_diversity/std": 0.1269940882921219, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.88818359375, "rewards/symbolic_reward_partial_score/std": 0.2839567959308624, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0436235666275024, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 119.89215850830078, "sampling/sampling_logp_difference/mean": 0.4233931005001068, "step": 821 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2538767158985138, "epoch": 2.163157894736842, "grad_norm": 0.014526182785630226, "learning_rate": 1e-06, "loss": 0.0951, "step": 822 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2911252975463867, "epoch": 2.1657894736842107, "grad_norm": 0.010871890932321548, "learning_rate": 1e-06, "loss": 0.0845, "step": 823 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2438528761267662, "epoch": 2.168421052631579, "grad_norm": 0.008227149024605751, "learning_rate": 1e-06, "loss": 0.1009, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13903.0, "completions/mean_length": 1524.4453125, "completions/mean_terminated_length": 1198.1876220703125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.32260115444660187, "epoch": 2.1710526315789473, "frac_reward_zero_std": 0.28125, "grad_norm": 18.730411529541016, "learning_rate": 1e-06, "loss": 0.0989, "num_tokens": 257055834.0, "reward": 0.7190285921096802, "reward_std": 0.2212091088294983, "rewards/progression_diversity/mean": -0.04050877317786217, "rewards/progression_diversity/std": 0.1305409073829651, "rewards/symbolic_reward_accuracy/mean": 0.78515625, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.8544921875, "rewards/symbolic_reward_partial_score/std": 0.31850993633270264, "rewards/tag_count_reward/mean": -0.080078125, "rewards/tag_count_reward/std": 0.271679550409317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0542526245117188, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 130.9848175048828, "sampling/sampling_logp_difference/mean": 0.42119100689888, "step": 825 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.28623804450035095, "epoch": 2.1736842105263157, "grad_norm": 0.009265235625207424, "learning_rate": 1e-06, "loss": 0.0857, "step": 826 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.3048030138015747, "epoch": 2.1763157894736844, "grad_norm": 0.5141742825508118, "learning_rate": 1e-06, "loss": 0.0416, "step": 827 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.33180132508277893, "epoch": 2.1789473684210527, "grad_norm": 0.01808958500623703, "learning_rate": 1e-06, "loss": 0.1253, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14963.0, "completions/mean_length": 1386.966796875, "completions/mean_terminated_length": 996.2625122070312, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3035607635974884, "epoch": 2.181578947368421, "frac_reward_zero_std": 0.28125, "grad_norm": 20.19339370727539, "learning_rate": 1e-06, "loss": 0.1055, "num_tokens": 258146409.0, "reward": 0.7440898418426514, "reward_std": 0.20957735180854797, "rewards/progression_diversity/mean": -0.029496124014258385, "rewards/progression_diversity/std": 0.10593532025814056, "rewards/symbolic_reward_accuracy/mean": 0.814453125, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.8712565302848816, "rewards/symbolic_reward_partial_score/std": 0.30432677268981934, "rewards/tag_count_reward/mean": -0.056640625, "rewards/tag_count_reward/std": 0.23138070106506348, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059896469116211, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 135.2486572265625, "sampling/sampling_logp_difference/mean": 0.5292673707008362, "step": 829 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.31164059042930603, "epoch": 2.1842105263157894, "grad_norm": 4.756748199462891, "learning_rate": 1e-06, "loss": 0.0966, "step": 830 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.32979193329811096, "epoch": 2.1868421052631577, "grad_norm": 0.020023655146360397, "learning_rate": 1e-06, "loss": 0.1029, "step": 831 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.30019381642341614, "epoch": 2.1894736842105265, "grad_norm": 0.01554365735501051, "learning_rate": 1e-06, "loss": 0.0552, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14676.0, "completions/mean_length": 1498.107421875, "completions/mean_terminated_length": 955.7064819335938, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.4383620619773865, "epoch": 2.192105263157895, "frac_reward_zero_std": 0.21875, "grad_norm": 26.805294036865234, "learning_rate": 1e-06, "loss": 0.1879, "num_tokens": 259308064.0, "reward": 0.708129346370697, "reward_std": 0.2203439176082611, "rewards/progression_diversity/mean": -0.03179560601711273, "rewards/progression_diversity/std": 0.11374954134225845, "rewards/symbolic_reward_accuracy/mean": 0.765625, "rewards/symbolic_reward_accuracy/std": 0.42402184009552, "rewards/symbolic_reward_partial_score/mean": 0.8465169072151184, "rewards/symbolic_reward_partial_score/std": 0.31652384996414185, "rewards/tag_count_reward/mean": -0.048828125, "rewards/tag_count_reward/std": 0.2157193273305893, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.056193470954895, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 135.6931915283203, "sampling/sampling_logp_difference/mean": 0.8543011546134949, "step": 833 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3004182279109955, "epoch": 2.194736842105263, "grad_norm": 0.017624402418732643, "learning_rate": 1e-06, "loss": 0.0347, "step": 834 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3438268452882767, "epoch": 2.1973684210526314, "grad_norm": 0.015222882851958275, "learning_rate": 1e-06, "loss": 0.1393, "step": 835 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.32134924829006195, "epoch": 2.2, "grad_norm": 0.017897766083478928, "learning_rate": 1e-06, "loss": 0.0976, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13689.0, "completions/mean_length": 1387.171875, "completions/mean_terminated_length": 903.4031982421875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.41330593824386597, "epoch": 2.2026315789473685, "frac_reward_zero_std": 0.375, "grad_norm": 17.12291717529297, "learning_rate": 1e-06, "loss": 0.1254, "num_tokens": 260406584.0, "reward": 0.7749391794204712, "reward_std": 0.17058953642845154, "rewards/progression_diversity/mean": -0.020733974874019623, "rewards/progression_diversity/std": 0.08155296742916107, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.8982747793197632, "rewards/symbolic_reward_partial_score/std": 0.27264928817749023, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0631152391433716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 136.99832153320312, "sampling/sampling_logp_difference/mean": 0.695594310760498, "step": 837 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.40689145028591156, "epoch": 2.205263157894737, "grad_norm": 0.004986094310879707, "learning_rate": 1e-06, "loss": 0.1718, "step": 838 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3304235339164734, "epoch": 2.207894736842105, "grad_norm": 0.013745912350714207, "learning_rate": 1e-06, "loss": 0.0868, "step": 839 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.32582440972328186, "epoch": 2.2105263157894735, "grad_norm": 0.00617125304415822, "learning_rate": 1e-06, "loss": 0.0352, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13147.0, "completions/mean_length": 1191.048828125, "completions/mean_terminated_length": 605.5192260742188, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.3709278106689453, "epoch": 2.213157894736842, "frac_reward_zero_std": 0.25, "grad_norm": 21.482324600219727, "learning_rate": 1e-06, "loss": 0.1633, "num_tokens": 261411185.0, "reward": 0.7811179161071777, "reward_std": 0.21279799938201904, "rewards/progression_diversity/mean": -0.018096236512064934, "rewards/progression_diversity/std": 0.08493653684854507, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.9129231572151184, "rewards/symbolic_reward_partial_score/std": 0.247787743806839, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0535836219787598, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 139.99432373046875, "sampling/sampling_logp_difference/mean": 1.1103196144104004, "step": 841 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3213431090116501, "epoch": 2.2157894736842105, "grad_norm": 0.012497864663600922, "learning_rate": 1e-06, "loss": 0.0665, "step": 842 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.3591870814561844, "epoch": 2.218421052631579, "grad_norm": 0.010890123434364796, "learning_rate": 1e-06, "loss": 0.1304, "step": 843 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2970151901245117, "epoch": 2.221052631578947, "grad_norm": 0.11356399953365326, "learning_rate": 1e-06, "loss": 0.0636, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14047.0, "completions/mean_length": 1340.443359375, "completions/mean_terminated_length": 568.1868896484375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.33584554493427277, "epoch": 2.223684210526316, "frac_reward_zero_std": 0.375, "grad_norm": 19.238874435424805, "learning_rate": 1e-06, "loss": 0.1027, "num_tokens": 262479924.0, "reward": 0.7776129245758057, "reward_std": 0.1951584815979004, "rewards/progression_diversity/mean": -0.01703125424683094, "rewards/progression_diversity/std": 0.0720345601439476, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.90380859375, "rewards/symbolic_reward_partial_score/std": 0.2672818601131439, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0570869445800781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 143.1820831298828, "sampling/sampling_logp_difference/mean": 0.9214514493942261, "step": 845 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3423171490430832, "epoch": 2.2263157894736842, "grad_norm": 0.010278112255036831, "learning_rate": 1e-06, "loss": 0.1566, "step": 846 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3419143259525299, "epoch": 2.2289473684210526, "grad_norm": 0.011319981887936592, "learning_rate": 1e-06, "loss": 0.0796, "step": 847 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3335099071264267, "epoch": 2.231578947368421, "grad_norm": 0.015382968820631504, "learning_rate": 1e-06, "loss": 0.0699, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14075.0, "completions/mean_length": 1389.193359375, "completions/mean_terminated_length": 811.3001708984375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.35164597630500793, "epoch": 2.2342105263157896, "frac_reward_zero_std": 0.3125, "grad_norm": 7.93258810043335, "learning_rate": 1e-06, "loss": 0.0673, "num_tokens": 263594679.0, "reward": 0.7638546228408813, "reward_std": 0.19747570157051086, "rewards/progression_diversity/mean": -0.025676485151052475, "rewards/progression_diversity/std": 0.09972041100263596, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.8953450322151184, "rewards/symbolic_reward_partial_score/std": 0.2742253541946411, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.053678035736084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 163.0, "sampling/sampling_logp_difference/mean": 1.083064317703247, "step": 849 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3196953535079956, "epoch": 2.236842105263158, "grad_norm": 0.17636702954769135, "learning_rate": 1e-06, "loss": 0.0405, "step": 850 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.37696947157382965, "epoch": 2.2394736842105263, "grad_norm": 0.02072129212319851, "learning_rate": 1e-06, "loss": 0.1117, "step": 851 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3250402361154556, "epoch": 2.2421052631578946, "grad_norm": 0.014444287866353989, "learning_rate": 1e-06, "loss": 0.1048, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13668.0, "completions/mean_length": 1304.669921875, "completions/mean_terminated_length": 627.6387329101562, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.3698349595069885, "epoch": 2.2447368421052634, "frac_reward_zero_std": 0.375, "grad_norm": 217.0945587158203, "learning_rate": 1e-06, "loss": 0.1591, "num_tokens": 264646126.0, "reward": 0.7918961048126221, "reward_std": 0.1911715716123581, "rewards/progression_diversity/mean": -0.019374651834368706, "rewards/progression_diversity/std": 0.08247331529855728, "rewards/symbolic_reward_accuracy/mean": 0.873046875, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.9098306894302368, "rewards/symbolic_reward_partial_score/std": 0.2634342610836029, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0525051355361938, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 169.99769592285156, "sampling/sampling_logp_difference/mean": 1.494558334350586, "step": 853 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.30978524684906006, "epoch": 2.2473684210526317, "grad_norm": 0.008726145140826702, "learning_rate": 1e-06, "loss": 0.0635, "step": 854 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.32608427107334137, "epoch": 2.25, "grad_norm": 0.004246350843459368, "learning_rate": 1e-06, "loss": 0.103, "step": 855 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.308328241109848, "epoch": 2.2526315789473683, "grad_norm": 0.005722150672227144, "learning_rate": 1e-06, "loss": 0.0967, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13585.0, "completions/mean_length": 1705.638671875, "completions/mean_terminated_length": 694.3945922851562, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.29534509778022766, "epoch": 2.2552631578947366, "frac_reward_zero_std": 0.15625, "grad_norm": 196.59950256347656, "learning_rate": 1e-06, "loss": 0.1282, "num_tokens": 265932725.0, "reward": 0.7359683513641357, "reward_std": 0.2585596442222595, "rewards/progression_diversity/mean": -0.031095456331968307, "rewards/progression_diversity/std": 0.1090308129787445, "rewards/symbolic_reward_accuracy/mean": 0.80078125, "rewards/symbolic_reward_accuracy/std": 0.39980348944664, "rewards/symbolic_reward_partial_score/mean": 0.87548828125, "rewards/symbolic_reward_partial_score/std": 0.2973478138446808, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0500712394714355, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 166.0, "sampling/sampling_logp_difference/mean": 1.9202924966812134, "step": 857 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.40625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.377376526594162, "epoch": 2.2578947368421054, "grad_norm": 0.00950684119015932, "learning_rate": 1e-06, "loss": 0.1836, "step": 858 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.4171202927827835, "epoch": 2.2605263157894737, "grad_norm": 41.98322296142578, "learning_rate": 1e-06, "loss": 0.2213, "step": 859 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.3512898087501526, "epoch": 2.263157894736842, "grad_norm": 0.01232555229216814, "learning_rate": 1e-06, "loss": 0.1466, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 11700.0, "completions/mean_length": 1474.6875, "completions/mean_terminated_length": 612.165283203125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3273746818304062, "epoch": 2.2657894736842104, "frac_reward_zero_std": 0.1875, "grad_norm": 47.61751937866211, "learning_rate": 1e-06, "loss": 0.1721, "num_tokens": 267097749.0, "reward": 0.7948732972145081, "reward_std": 0.23872321844100952, "rewards/progression_diversity/mean": -0.02927570417523384, "rewards/progression_diversity/std": 0.11151636391878128, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.9148763418197632, "rewards/symbolic_reward_partial_score/std": 0.253930926322937, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0462769269943237, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 157.0, "sampling/sampling_logp_difference/mean": 1.056786060333252, "step": 861 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.28751927614212036, "epoch": 2.268421052631579, "grad_norm": 0.012664435431361198, "learning_rate": 1e-06, "loss": 0.0962, "step": 862 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3128338009119034, "epoch": 2.2710526315789474, "grad_norm": 0.010503551922738552, "learning_rate": 1e-06, "loss": 0.1709, "step": 863 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.299874484539032, "epoch": 2.2736842105263158, "grad_norm": 0.009202693589031696, "learning_rate": 1e-06, "loss": 0.1373, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 12330.0, "completions/mean_length": 1326.27734375, "completions/mean_terminated_length": 618.0408935546875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.29694661498069763, "epoch": 2.276315789473684, "frac_reward_zero_std": 0.25, "grad_norm": 28.95979881286621, "learning_rate": 1e-06, "loss": 0.1895, "num_tokens": 268138083.0, "reward": 0.7938084602355957, "reward_std": 0.1921263337135315, "rewards/progression_diversity/mean": -0.02833309769630432, "rewards/progression_diversity/std": 0.11551596224308014, "rewards/symbolic_reward_accuracy/mean": 0.869140625, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.92041015625, "rewards/symbolic_reward_partial_score/std": 0.24347224831581116, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0430631637573242, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 157.90789794921875, "sampling/sampling_logp_difference/mean": 0.31643933057785034, "step": 865 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2519264966249466, "epoch": 2.2789473684210524, "grad_norm": 0.0065588075667619705, "learning_rate": 1e-06, "loss": 0.0924, "step": 866 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.260475218296051, "epoch": 2.281578947368421, "grad_norm": 0.291818231344223, "learning_rate": 1e-06, "loss": 0.0611, "step": 867 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2639893591403961, "epoch": 2.2842105263157895, "grad_norm": 0.010040219873189926, "learning_rate": 1e-06, "loss": 0.0681, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 10708.0, "completions/mean_length": 1395.240234375, "completions/mean_terminated_length": 560.814453125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.2564142793416977, "epoch": 2.286842105263158, "frac_reward_zero_std": 0.34375, "grad_norm": 28.421417236328125, "learning_rate": 1e-06, "loss": 0.2116, "num_tokens": 269248734.0, "reward": 0.7830079793930054, "reward_std": 0.18753179907798767, "rewards/progression_diversity/mean": -0.03417276591062546, "rewards/progression_diversity/std": 0.13364149630069733, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.9034830331802368, "rewards/symbolic_reward_partial_score/std": 0.2679261863231659, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.034959316253662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 158.98016357421875, "sampling/sampling_logp_difference/mean": 0.3048059940338135, "step": 869 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2360822707414627, "epoch": 2.2894736842105265, "grad_norm": 0.009712324477732182, "learning_rate": 1e-06, "loss": 0.0756, "step": 870 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2438521683216095, "epoch": 2.292105263157895, "grad_norm": 0.008457427844405174, "learning_rate": 1e-06, "loss": 0.0852, "step": 871 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23070208728313446, "epoch": 2.294736842105263, "grad_norm": 0.006932724732905626, "learning_rate": 1e-06, "loss": 0.1447, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10716.0, "completions/mean_length": 1633.05859375, "completions/mean_terminated_length": 583.8284301757812, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.24190223217010498, "epoch": 2.2973684210526315, "frac_reward_zero_std": 0.3125, "grad_norm": 5.179454326629639, "learning_rate": 1e-06, "loss": 0.0743, "num_tokens": 270490172.0, "reward": 0.7714011669158936, "reward_std": 0.20021489262580872, "rewards/progression_diversity/mean": -0.03762088716030121, "rewards/progression_diversity/std": 0.13623382151126862, "rewards/symbolic_reward_accuracy/mean": 0.84375, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.9046223759651184, "rewards/symbolic_reward_partial_score/std": 0.26515042781829834, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0347652435302734, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 162.99392700195312, "sampling/sampling_logp_difference/mean": 0.2914007902145386, "step": 873 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.24344737827777863, "epoch": 2.3, "grad_norm": 3.351227045059204, "learning_rate": 1e-06, "loss": 0.1364, "step": 874 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2388121336698532, "epoch": 2.3026315789473686, "grad_norm": 0.011248363181948662, "learning_rate": 1e-06, "loss": 0.154, "step": 875 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2308916077017784, "epoch": 2.305263157894737, "grad_norm": 0.24132992327213287, "learning_rate": 1e-06, "loss": 0.1312, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 10811.0, "completions/mean_length": 1310.69921875, "completions/mean_terminated_length": 569.3892822265625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.24580485373735428, "epoch": 2.307894736842105, "frac_reward_zero_std": 0.3125, "grad_norm": 3.5337963104248047, "learning_rate": 1e-06, "loss": 0.0719, "num_tokens": 271567746.0, "reward": 0.7978794574737549, "reward_std": 0.2167576551437378, "rewards/progression_diversity/mean": -0.026513516902923584, "rewards/progression_diversity/std": 0.11184799671173096, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.9156901240348816, "rewards/symbolic_reward_partial_score/std": 0.25561559200286865, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0415388345718384, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 161.8677520751953, "sampling/sampling_logp_difference/mean": 0.3057638108730316, "step": 877 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2451568767428398, "epoch": 2.3105263157894735, "grad_norm": 0.012724120169878006, "learning_rate": 1e-06, "loss": 0.0798, "step": 878 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.24493427574634552, "epoch": 2.3131578947368423, "grad_norm": 0.018414990976452827, "learning_rate": 1e-06, "loss": 0.1669, "step": 879 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.24908612668514252, "epoch": 2.3157894736842106, "grad_norm": 0.00875798612833023, "learning_rate": 1e-06, "loss": 0.138, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11232.0, "completions/mean_length": 1375.77734375, "completions/mean_terminated_length": 572.8682861328125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.2587193250656128, "epoch": 2.318421052631579, "frac_reward_zero_std": 0.28125, "grad_norm": 12.977412223815918, "learning_rate": 1e-06, "loss": 0.1116, "num_tokens": 272652592.0, "reward": 0.7711009979248047, "reward_std": 0.20819194614887238, "rewards/progression_diversity/mean": -0.02857367694377899, "rewards/progression_diversity/std": 0.11577697843313217, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.9085286259651184, "rewards/symbolic_reward_partial_score/std": 0.2529752552509308, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0420448780059814, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 166.0, "sampling/sampling_logp_difference/mean": 0.29961156845092773, "step": 881 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.26260629296302795, "epoch": 2.3210526315789473, "grad_norm": 0.06035393849015236, "learning_rate": 1e-06, "loss": 0.1662, "step": 882 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2572166621685028, "epoch": 2.3236842105263156, "grad_norm": 0.01509555708616972, "learning_rate": 1e-06, "loss": 0.0984, "step": 883 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2562931329011917, "epoch": 2.3263157894736843, "grad_norm": 0.009722933173179626, "learning_rate": 1e-06, "loss": 0.0405, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11360.0, "completions/mean_length": 1769.9296875, "completions/mean_terminated_length": 531.4491577148438, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.2543962597846985, "epoch": 2.3289473684210527, "frac_reward_zero_std": 0.3125, "grad_norm": 8.712998390197754, "learning_rate": 1e-06, "loss": 0.1231, "num_tokens": 273958092.0, "reward": 0.7438181042671204, "reward_std": 0.2017257809638977, "rewards/progression_diversity/mean": -0.04201960563659668, "rewards/progression_diversity/std": 0.1431008279323578, "rewards/symbolic_reward_accuracy/mean": 0.810546875, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.8818359375, "rewards/symbolic_reward_partial_score/std": 0.29238346219062805, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041464924812317, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 171.0, "sampling/sampling_logp_difference/mean": 0.3230324387550354, "step": 885 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.24783600121736526, "epoch": 2.331578947368421, "grad_norm": 0.008411634713411331, "learning_rate": 1e-06, "loss": 0.0807, "step": 886 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24633554369211197, "epoch": 2.3342105263157893, "grad_norm": 0.013761989772319794, "learning_rate": 1e-06, "loss": 0.1864, "step": 887 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2572323977947235, "epoch": 2.336842105263158, "grad_norm": 0.011136609129607677, "learning_rate": 1e-06, "loss": 0.1732, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15127.0, "completions/mean_length": 1794.662109375, "completions/mean_terminated_length": 756.9267578125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.274614155292511, "epoch": 2.3394736842105264, "frac_reward_zero_std": 0.28125, "grad_norm": 30.201536178588867, "learning_rate": 1e-06, "loss": 0.1424, "num_tokens": 275266591.0, "reward": 0.7809537649154663, "reward_std": 0.18957683444023132, "rewards/progression_diversity/mean": -0.039390042424201965, "rewards/progression_diversity/std": 0.1323193460702896, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.9039713740348816, "rewards/symbolic_reward_partial_score/std": 0.26959308981895447, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.047221302986145, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 186.931884765625, "sampling/sampling_logp_difference/mean": 0.643196702003479, "step": 889 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2707149535417557, "epoch": 2.3421052631578947, "grad_norm": 0.010098773054778576, "learning_rate": 1e-06, "loss": 0.1054, "step": 890 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.28222164511680603, "epoch": 2.344736842105263, "grad_norm": 0.3068772554397583, "learning_rate": 1e-06, "loss": 0.1973, "step": 891 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.266815185546875, "epoch": 2.3473684210526318, "grad_norm": 1.143034815788269, "learning_rate": 1e-06, "loss": 0.126, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12367.0, "completions/mean_length": 1806.0625, "completions/mean_terminated_length": 570.64404296875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.2788756787776947, "epoch": 2.35, "frac_reward_zero_std": 0.25, "grad_norm": 49.109130859375, "learning_rate": 1e-06, "loss": 0.192, "num_tokens": 276565887.0, "reward": 0.7949280142784119, "reward_std": 0.20325177907943726, "rewards/progression_diversity/mean": -0.04333332180976868, "rewards/progression_diversity/std": 0.14543607831001282, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.91162109375, "rewards/symbolic_reward_partial_score/std": 0.2654019296169281, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0455005168914795, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 178.99993896484375, "sampling/sampling_logp_difference/mean": 0.7122431993484497, "step": 893 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2602469027042389, "epoch": 2.3526315789473684, "grad_norm": 0.005726585630327463, "learning_rate": 1e-06, "loss": 0.1203, "step": 894 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2623884826898575, "epoch": 2.3552631578947367, "grad_norm": 0.15292230248451233, "learning_rate": 1e-06, "loss": 0.0371, "step": 895 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2809968888759613, "epoch": 2.3578947368421055, "grad_norm": 0.005319634452462196, "learning_rate": 1e-06, "loss": 0.2592, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14861.0, "completions/mean_length": 2606.03515625, "completions/mean_terminated_length": 777.1017456054688, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.28651466965675354, "epoch": 2.360526315789474, "frac_reward_zero_std": 0.09375, "grad_norm": 24.261709213256836, "learning_rate": 1e-06, "loss": 0.1616, "num_tokens": 278317681.0, "reward": 0.7092536687850952, "reward_std": 0.2737279534339905, "rewards/progression_diversity/mean": -0.05608125776052475, "rewards/progression_diversity/std": 0.1531989425420761, "rewards/symbolic_reward_accuracy/mean": 0.78125, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.8426106572151184, "rewards/symbolic_reward_partial_score/std": 0.3354193866252899, "rewards/tag_count_reward/mean": -0.1171875, "rewards/tag_count_reward/std": 0.32195815443992615, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045390248298645, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 180.99990844726562, "sampling/sampling_logp_difference/mean": 0.8015948534011841, "step": 897 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.30110684037208557, "epoch": 2.363157894736842, "grad_norm": 4.408512115478516, "learning_rate": 1e-06, "loss": 0.2523, "step": 898 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2703721821308136, "epoch": 2.3657894736842104, "grad_norm": 0.4832998216152191, "learning_rate": 1e-06, "loss": 0.1279, "step": 899 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.28015589714050293, "epoch": 2.3684210526315788, "grad_norm": 0.007808802183717489, "learning_rate": 1e-06, "loss": 0.1913, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 12939.0, "completions/mean_length": 1535.99609375, "completions/mean_terminated_length": 805.766357421875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.2482433319091797, "epoch": 2.3710526315789475, "frac_reward_zero_std": 0.25, "grad_norm": 15.909863471984863, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 279514095.0, "reward": 0.775747537612915, "reward_std": 0.22888442873954773, "rewards/progression_diversity/mean": -0.032673317939043045, "rewards/progression_diversity/std": 0.12282172590494156, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.9013671875, "rewards/symbolic_reward_partial_score/std": 0.26712751388549805, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0461652278900146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 189.7768096923828, "sampling/sampling_logp_difference/mean": 0.7810826301574707, "step": 901 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.275927871465683, "epoch": 2.373684210526316, "grad_norm": 0.009297163225710392, "learning_rate": 1e-06, "loss": 0.1966, "step": 902 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2581518739461899, "epoch": 2.376315789473684, "grad_norm": 0.01285830419510603, "learning_rate": 1e-06, "loss": 0.1598, "step": 903 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.27708950638771057, "epoch": 2.3789473684210525, "grad_norm": 0.01762818545103073, "learning_rate": 1e-06, "loss": 0.2154, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12266.0, "completions/mean_length": 1788.240234375, "completions/mean_terminated_length": 684.3592529296875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.2828632742166519, "epoch": 2.3815789473684212, "frac_reward_zero_std": 0.1875, "grad_norm": 28.789213180541992, "learning_rate": 1e-06, "loss": 0.1664, "num_tokens": 280833482.0, "reward": 0.7678719162940979, "reward_std": 0.19974274933338165, "rewards/progression_diversity/mean": -0.034103699028491974, "rewards/progression_diversity/std": 0.11745806038379669, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.9051106572151184, "rewards/symbolic_reward_partial_score/std": 0.2643766403198242, "rewards/tag_count_reward/mean": -0.072265625, "rewards/tag_count_reward/std": 0.2591804563999176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0446805953979492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 187.5076446533203, "sampling/sampling_logp_difference/mean": 0.7177349328994751, "step": 905 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2618148922920227, "epoch": 2.3842105263157896, "grad_norm": 0.5371949672698975, "learning_rate": 1e-06, "loss": 0.1672, "step": 906 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.26834670454263687, "epoch": 2.386842105263158, "grad_norm": 0.008967792615294456, "learning_rate": 1e-06, "loss": 0.0927, "step": 907 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.2814731001853943, "epoch": 2.389473684210526, "grad_norm": 1.3082025051116943, "learning_rate": 1e-06, "loss": 0.1104, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11597.0, "completions/mean_length": 2223.818359375, "completions/mean_terminated_length": 826.0321655273438, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.26080475747585297, "epoch": 2.3921052631578945, "frac_reward_zero_std": 0.15625, "grad_norm": 56.598018646240234, "learning_rate": 1e-06, "loss": 0.1662, "num_tokens": 282405421.0, "reward": 0.7159743309020996, "reward_std": 0.26898401975631714, "rewards/progression_diversity/mean": -0.04807615280151367, "rewards/progression_diversity/std": 0.13962800800800323, "rewards/symbolic_reward_accuracy/mean": 0.783203125, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.8575846552848816, "rewards/symbolic_reward_partial_score/std": 0.31706979870796204, "rewards/tag_count_reward/mean": -0.107421875, "rewards/tag_count_reward/std": 0.30995169281959534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0462749004364014, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 186.0, "sampling/sampling_logp_difference/mean": 0.7766126990318298, "step": 909 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.27692703902721405, "epoch": 2.3947368421052633, "grad_norm": 3.8725080490112305, "learning_rate": 1e-06, "loss": 0.1265, "step": 910 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2723271772265434, "epoch": 2.3973684210526316, "grad_norm": 3.388723611831665, "learning_rate": 1e-06, "loss": 0.1184, "step": 911 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2788117825984955, "epoch": 2.4, "grad_norm": 0.0120708541944623, "learning_rate": 1e-06, "loss": 0.2389, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11453.0, "completions/mean_length": 1910.248046875, "completions/mean_terminated_length": 616.8489379882812, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.2541096359491348, "epoch": 2.4026315789473682, "frac_reward_zero_std": 0.1875, "grad_norm": 0.020459089428186417, "learning_rate": 1e-06, "loss": 0.0879, "num_tokens": 283807468.0, "reward": 0.7370096445083618, "reward_std": 0.23636674880981445, "rewards/progression_diversity/mean": -0.03927619010210037, "rewards/progression_diversity/std": 0.13120803236961365, "rewards/symbolic_reward_accuracy/mean": 0.802734375, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.8798828125, "rewards/symbolic_reward_partial_score/std": 0.28891611099243164, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0478408336639404, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 186.0, "sampling/sampling_logp_difference/mean": 0.6119551658630371, "step": 913 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.28826163709163666, "epoch": 2.405263157894737, "grad_norm": 10.523284912109375, "learning_rate": 1e-06, "loss": 0.2367, "step": 914 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2593996897339821, "epoch": 2.4078947368421053, "grad_norm": 0.012641196139156818, "learning_rate": 1e-06, "loss": 0.1789, "step": 915 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2622472196817398, "epoch": 2.4105263157894736, "grad_norm": 0.012980838306248188, "learning_rate": 1e-06, "loss": 0.0986, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10840.0, "completions/mean_length": 1772.17578125, "completions/mean_terminated_length": 667.0798950195312, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2561643496155739, "epoch": 2.413157894736842, "frac_reward_zero_std": 0.25, "grad_norm": 34.25957489013672, "learning_rate": 1e-06, "loss": 0.0891, "num_tokens": 285117126.0, "reward": 0.7429047226905823, "reward_std": 0.22891083359718323, "rewards/progression_diversity/mean": -0.040586501359939575, "rewards/progression_diversity/std": 0.14015312492847443, "rewards/symbolic_reward_accuracy/mean": 0.810546875, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.8826497793197632, "rewards/symbolic_reward_partial_score/std": 0.28473344445228577, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0470426082611084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 185.0, "sampling/sampling_logp_difference/mean": 0.5546722412109375, "step": 917 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2671400457620621, "epoch": 2.4157894736842107, "grad_norm": 0.01011139526963234, "learning_rate": 1e-06, "loss": 0.1876, "step": 918 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.25694411993026733, "epoch": 2.418421052631579, "grad_norm": 0.011605838313698769, "learning_rate": 1e-06, "loss": 0.1389, "step": 919 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2727016508579254, "epoch": 2.4210526315789473, "grad_norm": 1.0605076551437378, "learning_rate": 1e-06, "loss": 0.167, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11618.0, "completions/mean_length": 1737.193359375, "completions/mean_terminated_length": 728.1231689453125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.26561206579208374, "epoch": 2.4236842105263157, "frac_reward_zero_std": 0.25, "grad_norm": 114.35115814208984, "learning_rate": 1e-06, "loss": 0.1868, "num_tokens": 286425737.0, "reward": 0.7702474594116211, "reward_std": 0.19327302277088165, "rewards/progression_diversity/mean": -0.03580557554960251, "rewards/progression_diversity/std": 0.12353134900331497, "rewards/symbolic_reward_accuracy/mean": 0.84375, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.9000650644302368, "rewards/symbolic_reward_partial_score/std": 0.2719910442829132, "rewards/tag_count_reward/mean": -0.056640625, "rewards/tag_count_reward/std": 0.23138070106506348, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0514814853668213, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 187.99998474121094, "sampling/sampling_logp_difference/mean": 0.6879295110702515, "step": 921 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2657042294740677, "epoch": 2.4263157894736844, "grad_norm": 4.603257179260254, "learning_rate": 1e-06, "loss": 0.0912, "step": 922 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2845795154571533, "epoch": 2.4289473684210527, "grad_norm": 0.0070864069275557995, "learning_rate": 1e-06, "loss": 0.1715, "step": 923 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.27537743747234344, "epoch": 2.431578947368421, "grad_norm": 0.988322377204895, "learning_rate": 1e-06, "loss": 0.1607, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 11161.0, "completions/mean_length": 1345.548828125, "completions/mean_terminated_length": 605.9528198242188, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.25172994285821915, "epoch": 2.4342105263157894, "frac_reward_zero_std": 0.4375, "grad_norm": 17.042478561401367, "learning_rate": 1e-06, "loss": 0.1133, "num_tokens": 287528290.0, "reward": 0.8133134841918945, "reward_std": 0.15900012850761414, "rewards/progression_diversity/mean": -0.030958127230405807, "rewards/progression_diversity/std": 0.12952972948551178, "rewards/symbolic_reward_accuracy/mean": 0.8984375, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.9288737177848816, "rewards/symbolic_reward_partial_score/std": 0.23390844464302063, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.050032615661621, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 187.0, "sampling/sampling_logp_difference/mean": 0.4537726044654846, "step": 925 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.25580619275569916, "epoch": 2.4368421052631577, "grad_norm": 0.008091673254966736, "learning_rate": 1e-06, "loss": 0.1041, "step": 926 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2634577751159668, "epoch": 2.4394736842105265, "grad_norm": 0.019041819497942924, "learning_rate": 1e-06, "loss": 0.1278, "step": 927 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2495371326804161, "epoch": 2.442105263157895, "grad_norm": 0.00901293195784092, "learning_rate": 1e-06, "loss": 0.1001, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10453.0, "completions/mean_length": 2109.220703125, "completions/mean_terminated_length": 598.4989013671875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.26799434423446655, "epoch": 2.444736842105263, "frac_reward_zero_std": 0.21875, "grad_norm": 14.625404357910156, "learning_rate": 1e-06, "loss": 0.1196, "num_tokens": 289010675.0, "reward": 0.7222642302513123, "reward_std": 0.21547727286815643, "rewards/progression_diversity/mean": -0.04896814748644829, "rewards/progression_diversity/std": 0.15030057728290558, "rewards/symbolic_reward_accuracy/mean": 0.783203125, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.8688151240348816, "rewards/symbolic_reward_partial_score/std": 0.29783180356025696, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0500848293304443, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 184.99998474121094, "sampling/sampling_logp_difference/mean": 0.47443127632141113, "step": 929 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.26960813999176025, "epoch": 2.4473684210526314, "grad_norm": 1.1039235591888428, "learning_rate": 1e-06, "loss": 0.1318, "step": 930 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.26754334568977356, "epoch": 2.45, "grad_norm": 0.5074542164802551, "learning_rate": 1e-06, "loss": 0.126, "step": 931 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2775499224662781, "epoch": 2.4526315789473685, "grad_norm": 0.007839949801564217, "learning_rate": 1e-06, "loss": 0.1686, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11585.0, "completions/mean_length": 2094.462890625, "completions/mean_terminated_length": 683.90771484375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.2719879597425461, "epoch": 2.455263157894737, "frac_reward_zero_std": 0.125, "grad_norm": 22.145875930786133, "learning_rate": 1e-06, "loss": 0.1494, "num_tokens": 290511872.0, "reward": 0.7216647267341614, "reward_std": 0.2439390867948532, "rewards/progression_diversity/mean": -0.05032936483621597, "rewards/progression_diversity/std": 0.15467680990695953, "rewards/symbolic_reward_accuracy/mean": 0.78515625, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.8629556894302368, "rewards/symbolic_reward_partial_score/std": 0.29924580454826355, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0494955778121948, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 185.0, "sampling/sampling_logp_difference/mean": 0.5632617473602295, "step": 933 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2774350792169571, "epoch": 2.457894736842105, "grad_norm": 9.17587947845459, "learning_rate": 1e-06, "loss": 0.1804, "step": 934 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2561372071504593, "epoch": 2.4605263157894735, "grad_norm": 0.012559008784592152, "learning_rate": 1e-06, "loss": 0.1994, "step": 935 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.27159862220287323, "epoch": 2.463157894736842, "grad_norm": 0.4640989601612091, "learning_rate": 1e-06, "loss": 0.1847, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9495.0, "completions/mean_length": 1562.392578125, "completions/mean_terminated_length": 508.1359558105469, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.24451758712530136, "epoch": 2.4657894736842105, "frac_reward_zero_std": 0.28125, "grad_norm": 7.657289505004883, "learning_rate": 1e-06, "loss": 0.1219, "num_tokens": 291718473.0, "reward": 0.7819766998291016, "reward_std": 0.22530367970466614, "rewards/progression_diversity/mean": -0.03475029766559601, "rewards/progression_diversity/std": 0.13393589854240417, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.9059244394302368, "rewards/symbolic_reward_partial_score/std": 0.2685166001319885, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0499627590179443, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 183.99996948242188, "sampling/sampling_logp_difference/mean": 0.395052433013916, "step": 937 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.26470696926116943, "epoch": 2.468421052631579, "grad_norm": 14.9205961227417, "learning_rate": 1e-06, "loss": 0.1575, "step": 938 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.25287581980228424, "epoch": 2.4710526315789476, "grad_norm": 0.30867093801498413, "learning_rate": 1e-06, "loss": 0.0946, "step": 939 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2676776349544525, "epoch": 2.473684210526316, "grad_norm": 0.011630654335021973, "learning_rate": 1e-06, "loss": 0.164, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11695.0, "completions/mean_length": 1605.857421875, "completions/mean_terminated_length": 620.64794921875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.2720135450363159, "epoch": 2.4763157894736842, "frac_reward_zero_std": 0.25, "grad_norm": 22.873489379882812, "learning_rate": 1e-06, "loss": 0.1462, "num_tokens": 292933632.0, "reward": 0.7649558186531067, "reward_std": 0.22175286710262299, "rewards/progression_diversity/mean": -0.04250683635473251, "rewards/progression_diversity/std": 0.15298283100128174, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.90283203125, "rewards/symbolic_reward_partial_score/std": 0.2671821117401123, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0548969507217407, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 186.0, "sampling/sampling_logp_difference/mean": 0.43102023005485535, "step": 941 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2624838799238205, "epoch": 2.4789473684210526, "grad_norm": 0.00974233727902174, "learning_rate": 1e-06, "loss": 0.0969, "step": 942 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2723797559738159, "epoch": 2.481578947368421, "grad_norm": 0.013580858707427979, "learning_rate": 1e-06, "loss": 0.1871, "step": 943 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.27662527561187744, "epoch": 2.4842105263157896, "grad_norm": 0.8762795329093933, "learning_rate": 1e-06, "loss": 0.1239, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12230.0, "completions/mean_length": 2296.412109375, "completions/mean_terminated_length": 669.74072265625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.2892232984304428, "epoch": 2.486842105263158, "frac_reward_zero_std": 0.125, "grad_norm": 16.07674217224121, "learning_rate": 1e-06, "loss": 0.3197, "num_tokens": 294500691.0, "reward": 0.721849799156189, "reward_std": 0.270133912563324, "rewards/progression_diversity/mean": -0.061112575232982635, "rewards/progression_diversity/std": 0.17030061781406403, "rewards/symbolic_reward_accuracy/mean": 0.79296875, "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, "rewards/symbolic_reward_partial_score/mean": 0.8528646230697632, "rewards/symbolic_reward_partial_score/std": 0.327158123254776, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061535120010376, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 182.99998474121094, "sampling/sampling_logp_difference/mean": 0.5773719549179077, "step": 945 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2695477306842804, "epoch": 2.4894736842105263, "grad_norm": 0.016710912808775902, "learning_rate": 1e-06, "loss": 0.1135, "step": 946 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.2837250232696533, "epoch": 2.4921052631578946, "grad_norm": 2.493502616882324, "learning_rate": 1e-06, "loss": 0.1442, "step": 947 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.28005383908748627, "epoch": 2.4947368421052634, "grad_norm": 1.3186397552490234, "learning_rate": 1e-06, "loss": 0.2604, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12417.0, "completions/mean_length": 2161.0078125, "completions/mean_terminated_length": 621.7229614257812, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2822905331850052, "epoch": 2.4973684210526317, "frac_reward_zero_std": 0.28125, "grad_norm": 14.119881629943848, "learning_rate": 1e-06, "loss": 0.1365, "num_tokens": 296000247.0, "reward": 0.7538953423500061, "reward_std": 0.20081034302711487, "rewards/progression_diversity/mean": -0.04991994798183441, "rewards/progression_diversity/std": 0.15201660990715027, "rewards/symbolic_reward_accuracy/mean": 0.83203125, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.8811848759651184, "rewards/symbolic_reward_partial_score/std": 0.3015502393245697, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0665756464004517, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 187.9999542236328, "sampling/sampling_logp_difference/mean": 0.5656195282936096, "step": 949 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.28284990787506104, "epoch": 2.5, "grad_norm": 2.7521109580993652, "learning_rate": 1e-06, "loss": 0.118, "step": 950 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2816344350576401, "epoch": 2.5026315789473683, "grad_norm": 2.637901544570923, "learning_rate": 1e-06, "loss": 0.1765, "step": 951 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2915927320718765, "epoch": 2.5052631578947366, "grad_norm": 0.009813666343688965, "learning_rate": 1e-06, "loss": 0.1735, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 10413.0, "completions/mean_length": 1986.9765625, "completions/mean_terminated_length": 565.8111572265625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.26933328807353973, "epoch": 2.5078947368421054, "frac_reward_zero_std": 0.3125, "grad_norm": 31.982301712036133, "learning_rate": 1e-06, "loss": 0.1037, "num_tokens": 297408875.0, "reward": 0.7207555770874023, "reward_std": 0.19293510913848877, "rewards/progression_diversity/mean": -0.04358455538749695, "rewards/progression_diversity/std": 0.14303399622440338, "rewards/symbolic_reward_accuracy/mean": 0.783203125, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.8675130605697632, "rewards/symbolic_reward_partial_score/std": 0.29670462012290955, "rewards/tag_count_reward/mean": -0.08984375, "rewards/tag_count_reward/std": 0.2862374484539032, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059614658355713, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 190.99974060058594, "sampling/sampling_logp_difference/mean": 0.5123655200004578, "step": 953 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2840705066919327, "epoch": 2.5105263157894737, "grad_norm": 9.22028923034668, "learning_rate": 1e-06, "loss": 0.2415, "step": 954 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2771739959716797, "epoch": 2.513157894736842, "grad_norm": 1.641637921333313, "learning_rate": 1e-06, "loss": 0.1161, "step": 955 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.27851031720638275, "epoch": 2.515789473684211, "grad_norm": 2.4652910232543945, "learning_rate": 1e-06, "loss": 0.1288, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13128.0, "completions/mean_length": 1963.44921875, "completions/mean_terminated_length": 674.8042602539062, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.2786627262830734, "epoch": 2.518421052631579, "frac_reward_zero_std": 0.34375, "grad_norm": 18.040977478027344, "learning_rate": 1e-06, "loss": 0.0985, "num_tokens": 298796273.0, "reward": 0.7595969438552856, "reward_std": 0.18789124488830566, "rewards/progression_diversity/mean": -0.041288819164037704, "rewards/progression_diversity/std": 0.13274167478084564, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.88623046875, "rewards/symbolic_reward_partial_score/std": 0.29010042548179626, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061213493347168, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 190.0, "sampling/sampling_logp_difference/mean": 0.4693317413330078, "step": 957 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2739100754261017, "epoch": 2.5210526315789474, "grad_norm": 0.009779985062777996, "learning_rate": 1e-06, "loss": 0.1521, "step": 958 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.295116662979126, "epoch": 2.5236842105263158, "grad_norm": 0.010075357742607594, "learning_rate": 1e-06, "loss": 0.1778, "step": 959 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.27153341472148895, "epoch": 2.526315789473684, "grad_norm": 0.01223987527191639, "learning_rate": 1e-06, "loss": 0.1236, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14244.0, "completions/mean_length": 2309.912109375, "completions/mean_terminated_length": 786.742431640625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.30197782814502716, "epoch": 2.5289473684210524, "frac_reward_zero_std": 0.15625, "grad_norm": 25.759937286376953, "learning_rate": 1e-06, "loss": 0.1054, "num_tokens": 300395268.0, "reward": 0.6984215378761292, "reward_std": 0.22170159220695496, "rewards/progression_diversity/mean": -0.0504293292760849, "rewards/progression_diversity/std": 0.14640994369983673, "rewards/symbolic_reward_accuracy/mean": 0.755859375, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.8512369394302368, "rewards/symbolic_reward_partial_score/std": 0.3122048079967499, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0820285081863403, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 189.9932861328125, "sampling/sampling_logp_difference/mean": 0.6296501159667969, "step": 961 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.31053292751312256, "epoch": 2.531578947368421, "grad_norm": 0.010956598445773125, "learning_rate": 1e-06, "loss": 0.197, "step": 962 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3268131613731384, "epoch": 2.5342105263157895, "grad_norm": 4.867886543273926, "learning_rate": 1e-06, "loss": 0.1752, "step": 963 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.29976852238178253, "epoch": 2.536842105263158, "grad_norm": 0.28594204783439636, "learning_rate": 1e-06, "loss": 0.1525, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14171.0, "completions/mean_length": 1403.908203125, "completions/mean_terminated_length": 667.1823120117188, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.3206019401550293, "epoch": 2.5394736842105265, "frac_reward_zero_std": 0.34375, "grad_norm": 21.9819393157959, "learning_rate": 1e-06, "loss": 0.1878, "num_tokens": 301511541.0, "reward": 0.8047443628311157, "reward_std": 0.16195279359817505, "rewards/progression_diversity/mean": -0.028497815132141113, "rewards/progression_diversity/std": 0.1169213354587555, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9347330331802368, "rewards/symbolic_reward_partial_score/std": 0.21818473935127258, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0805866718292236, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 188.99998474121094, "sampling/sampling_logp_difference/mean": 0.5423262119293213, "step": 965 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.28471747040748596, "epoch": 2.542105263157895, "grad_norm": 0.39148885011672974, "learning_rate": 1e-06, "loss": 0.0897, "step": 966 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.27406057715415955, "epoch": 2.544736842105263, "grad_norm": 0.004555261693894863, "learning_rate": 1e-06, "loss": 0.0543, "step": 967 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.29411862790584564, "epoch": 2.5473684210526315, "grad_norm": 0.011435529217123985, "learning_rate": 1e-06, "loss": 0.1264, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13752.0, "completions/mean_length": 1564.11328125, "completions/mean_terminated_length": 641.7137451171875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.31351859867572784, "epoch": 2.55, "frac_reward_zero_std": 0.375, "grad_norm": 25.21331787109375, "learning_rate": 1e-06, "loss": 0.1539, "num_tokens": 302711503.0, "reward": 0.7893426418304443, "reward_std": 0.18001574277877808, "rewards/progression_diversity/mean": -0.03058181144297123, "rewards/progression_diversity/std": 0.11552340537309647, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.9108072519302368, "rewards/symbolic_reward_partial_score/std": 0.25809013843536377, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0829228162765503, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 189.0, "sampling/sampling_logp_difference/mean": 0.5267312526702881, "step": 969 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2877921462059021, "epoch": 2.5526315789473686, "grad_norm": 0.15190419554710388, "learning_rate": 1e-06, "loss": 0.0471, "step": 970 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.30551964044570923, "epoch": 2.555263157894737, "grad_norm": 0.005192040465772152, "learning_rate": 1e-06, "loss": 0.1395, "step": 971 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.29702240228652954, "epoch": 2.557894736842105, "grad_norm": 0.016318701207637787, "learning_rate": 1e-06, "loss": 0.0983, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 11243.0, "completions/mean_length": 1504.9765625, "completions/mean_terminated_length": 578.8963012695312, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.27735981345176697, "epoch": 2.5605263157894735, "frac_reward_zero_std": 0.34375, "grad_norm": 5.76010274887085, "learning_rate": 1e-06, "loss": 0.0696, "num_tokens": 303875171.0, "reward": 0.7819315195083618, "reward_std": 0.1941755712032318, "rewards/progression_diversity/mean": -0.029511984437704086, "rewards/progression_diversity/std": 0.11651825904846191, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.908203125, "rewards/symbolic_reward_partial_score/std": 0.25990694761276245, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0842504501342773, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 192.0, "sampling/sampling_logp_difference/mean": 0.5295515060424805, "step": 973 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.33016660809516907, "epoch": 2.5631578947368423, "grad_norm": 0.0073794652707874775, "learning_rate": 1e-06, "loss": 0.201, "step": 974 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.287723571062088, "epoch": 2.5657894736842106, "grad_norm": 0.01062384806573391, "learning_rate": 1e-06, "loss": 0.139, "step": 975 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2862458676099777, "epoch": 2.568421052631579, "grad_norm": 0.01843968592584133, "learning_rate": 1e-06, "loss": 0.0968, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13225.0, "completions/mean_length": 1550.58203125, "completions/mean_terminated_length": 627.3402709960938, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.30991949141025543, "epoch": 2.5710526315789473, "frac_reward_zero_std": 0.3125, "grad_norm": 35.655460357666016, "learning_rate": 1e-06, "loss": 0.1447, "num_tokens": 305055693.0, "reward": 0.7691850662231445, "reward_std": 0.20995613932609558, "rewards/progression_diversity/mean": -0.02974330447614193, "rewards/progression_diversity/std": 0.1124248057603836, "rewards/symbolic_reward_accuracy/mean": 0.84375, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.8976237177848816, "rewards/symbolic_reward_partial_score/std": 0.27150553464889526, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0818887948989868, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 199.03089904785156, "sampling/sampling_logp_difference/mean": 0.4938344359397888, "step": 977 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2864755541086197, "epoch": 2.5736842105263156, "grad_norm": 0.02253652736544609, "learning_rate": 1e-06, "loss": 0.0692, "step": 978 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.28972767293453217, "epoch": 2.5763157894736843, "grad_norm": 0.007194779813289642, "learning_rate": 1e-06, "loss": 0.1079, "step": 979 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.30334651470184326, "epoch": 2.5789473684210527, "grad_norm": 0.007408217992633581, "learning_rate": 1e-06, "loss": 0.1557, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13332.0, "completions/mean_length": 1399.427734375, "completions/mean_terminated_length": 565.2350463867188, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.2880914658308029, "epoch": 2.581578947368421, "frac_reward_zero_std": 0.3125, "grad_norm": 79.71167755126953, "learning_rate": 1e-06, "loss": 0.1494, "num_tokens": 306174184.0, "reward": 0.8039476275444031, "reward_std": 0.21047255396842957, "rewards/progression_diversity/mean": -0.025160329416394234, "rewards/progression_diversity/std": 0.10529939085245132, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9241536855697632, "rewards/symbolic_reward_partial_score/std": 0.23598654568195343, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0879991054534912, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 199.0, "sampling/sampling_logp_difference/mean": 0.5080787539482117, "step": 981 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2747099846601486, "epoch": 2.5842105263157897, "grad_norm": 0.09463394433259964, "learning_rate": 1e-06, "loss": 0.0628, "step": 982 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.29164865612983704, "epoch": 2.586842105263158, "grad_norm": 0.016119860112667084, "learning_rate": 1e-06, "loss": 0.1221, "step": 983 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.3027169704437256, "epoch": 2.5894736842105264, "grad_norm": 0.01110443938523531, "learning_rate": 1e-06, "loss": 0.2168, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14648.0, "completions/mean_length": 1692.560546875, "completions/mean_terminated_length": 614.5723266601562, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.30789491534233093, "epoch": 2.5921052631578947, "frac_reward_zero_std": 0.1875, "grad_norm": 100.55760192871094, "learning_rate": 1e-06, "loss": 0.196, "num_tokens": 307458599.0, "reward": 0.7546229362487793, "reward_std": 0.2402782291173935, "rewards/progression_diversity/mean": -0.03575975075364113, "rewards/progression_diversity/std": 0.12710198760032654, "rewards/symbolic_reward_accuracy/mean": 0.8203125, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.8974609375, "rewards/symbolic_reward_partial_score/std": 0.26585423946380615, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0941245555877686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 200.0, "sampling/sampling_logp_difference/mean": 0.5682839751243591, "step": 985 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3354620784521103, "epoch": 2.594736842105263, "grad_norm": 0.8639695644378662, "learning_rate": 1e-06, "loss": 0.2717, "step": 986 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.27841542661190033, "epoch": 2.5973684210526313, "grad_norm": 0.023379117250442505, "learning_rate": 1e-06, "loss": 0.105, "step": 987 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2825475037097931, "epoch": 2.6, "grad_norm": 0.06907982379198074, "learning_rate": 1e-06, "loss": 0.0687, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13308.0, "completions/mean_length": 1544.58203125, "completions/mean_terminated_length": 588.1954345703125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3159325271844864, "epoch": 2.6026315789473684, "frac_reward_zero_std": 0.375, "grad_norm": 10.981948852539062, "learning_rate": 1e-06, "loss": 0.108, "num_tokens": 308619185.0, "reward": 0.7951602935791016, "reward_std": 0.1814492791891098, "rewards/progression_diversity/mean": -0.03475700318813324, "rewards/progression_diversity/std": 0.13058389723300934, "rewards/symbolic_reward_accuracy/mean": 0.876953125, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.9205728769302368, "rewards/symbolic_reward_partial_score/std": 0.24516622722148895, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.087169885635376, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 207.0, "sampling/sampling_logp_difference/mean": 0.5363781452178955, "step": 989 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.292761892080307, "epoch": 2.6052631578947367, "grad_norm": 0.7690657377243042, "learning_rate": 1e-06, "loss": 0.0832, "step": 990 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.30266323685646057, "epoch": 2.6078947368421055, "grad_norm": 0.011311556212604046, "learning_rate": 1e-06, "loss": 0.1176, "step": 991 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30019472539424896, "epoch": 2.610526315789474, "grad_norm": 0.01206042617559433, "learning_rate": 1e-06, "loss": 0.1639, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14076.0, "completions/mean_length": 1671.501953125, "completions/mean_terminated_length": 657.9060668945312, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.3192198872566223, "epoch": 2.613157894736842, "frac_reward_zero_std": 0.25, "grad_norm": 54.62965393066406, "learning_rate": 1e-06, "loss": 0.2534, "num_tokens": 309873970.0, "reward": 0.763299822807312, "reward_std": 0.21089032292366028, "rewards/progression_diversity/mean": -0.04208873584866524, "rewards/progression_diversity/std": 0.14815421402454376, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.9012044072151184, "rewards/symbolic_reward_partial_score/std": 0.26881706714630127, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1019631624221802, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 211.0, "sampling/sampling_logp_difference/mean": 0.8787015676498413, "step": 993 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3022729456424713, "epoch": 2.6157894736842104, "grad_norm": 0.012045558542013168, "learning_rate": 1e-06, "loss": 0.1114, "step": 994 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.298090398311615, "epoch": 2.6184210526315788, "grad_norm": 0.007539027836173773, "learning_rate": 1e-06, "loss": 0.1346, "step": 995 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2837032973766327, "epoch": 2.6210526315789475, "grad_norm": 0.5139919519424438, "learning_rate": 1e-06, "loss": 0.1695, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13815.0, "completions/mean_length": 2134.458984375, "completions/mean_terminated_length": 626.408203125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.28592349588871, "epoch": 2.623684210526316, "frac_reward_zero_std": 0.125, "grad_norm": 19.418561935424805, "learning_rate": 1e-06, "loss": 0.0376, "num_tokens": 311386141.0, "reward": 0.7048972845077515, "reward_std": 0.2682965099811554, "rewards/progression_diversity/mean": -0.05226554721593857, "rewards/progression_diversity/std": 0.15609407424926758, "rewards/symbolic_reward_accuracy/mean": 0.7578125, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.8663736581802368, "rewards/symbolic_reward_partial_score/std": 0.2934032678604126, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.119614601135254, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 214.0, "sampling/sampling_logp_difference/mean": 1.162545919418335, "step": 997 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.32562409341335297, "epoch": 2.626315789473684, "grad_norm": 0.011620646342635155, "learning_rate": 1e-06, "loss": 0.2374, "step": 998 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3074010759592056, "epoch": 2.6289473684210525, "grad_norm": 33.348323822021484, "learning_rate": 1e-06, "loss": 0.1771, "step": 999 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.31934235990047455, "epoch": 2.6315789473684212, "grad_norm": 0.009514860808849335, "learning_rate": 1e-06, "loss": 0.315, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13780.0, "completions/mean_length": 2376.53125, "completions/mean_terminated_length": 690.7308349609375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.30572909116744995, "epoch": 2.6342105263157896, "frac_reward_zero_std": 0.1875, "grad_norm": 94.71639251708984, "learning_rate": 1e-06, "loss": 0.2104, "num_tokens": 312988045.0, "reward": 0.7043373584747314, "reward_std": 0.23734444379806519, "rewards/progression_diversity/mean": -0.05454763025045395, "rewards/progression_diversity/std": 0.15527451038360596, "rewards/symbolic_reward_accuracy/mean": 0.76953125, "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, "rewards/symbolic_reward_partial_score/mean": 0.8483072519302368, "rewards/symbolic_reward_partial_score/std": 0.3243545591831207, "rewards/tag_count_reward/mean": -0.11328125, "rewards/tag_count_reward/std": 0.3172462284564972, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1103978157043457, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 215.0, "sampling/sampling_logp_difference/mean": 0.8874210119247437, "step": 1001 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.29766781628131866, "epoch": 2.636842105263158, "grad_norm": 1.0701483488082886, "learning_rate": 1e-06, "loss": 0.1461, "step": 1002 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.29889827966690063, "epoch": 2.639473684210526, "grad_norm": 0.014587471261620522, "learning_rate": 1e-06, "loss": 0.1481, "step": 1003 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.33510279655456543, "epoch": 2.6421052631578945, "grad_norm": 0.6961768269538879, "learning_rate": 1e-06, "loss": 0.2699, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13484.0, "completions/mean_length": 1625.13671875, "completions/mean_terminated_length": 575.3430786132812, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.29443803429603577, "epoch": 2.6447368421052633, "frac_reward_zero_std": 0.21875, "grad_norm": 39.419071197509766, "learning_rate": 1e-06, "loss": 0.1312, "num_tokens": 314233267.0, "reward": 0.7752651572227478, "reward_std": 0.22436900436878204, "rewards/progression_diversity/mean": -0.0369657427072525, "rewards/progression_diversity/std": 0.13464143872261047, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.91162109375, "rewards/symbolic_reward_partial_score/std": 0.2517907917499542, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1153823137283325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 214.0, "sampling/sampling_logp_difference/mean": 0.7468029260635376, "step": 1005 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.31393955647945404, "epoch": 2.6473684210526316, "grad_norm": 49.80437469482422, "learning_rate": 1e-06, "loss": 0.2599, "step": 1006 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2889316976070404, "epoch": 2.65, "grad_norm": 2.7875256538391113, "learning_rate": 1e-06, "loss": 0.0751, "step": 1007 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.305808961391449, "epoch": 2.6526315789473687, "grad_norm": 3.0765891075134277, "learning_rate": 1e-06, "loss": 0.2263, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14044.0, "completions/mean_length": 1598.720703125, "completions/mean_terminated_length": 678.4751586914062, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.30450139939785004, "epoch": 2.655263157894737, "frac_reward_zero_std": 0.125, "grad_norm": 31.50531578063965, "learning_rate": 1e-06, "loss": 0.1628, "num_tokens": 315446116.0, "reward": 0.7807782292366028, "reward_std": 0.25463420152664185, "rewards/progression_diversity/mean": -0.037412114441394806, "rewards/progression_diversity/std": 0.1364642083644867, "rewards/symbolic_reward_accuracy/mean": 0.86328125, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.8994140625, "rewards/symbolic_reward_partial_score/std": 0.2765096426010132, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1178048849105835, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 216.0, "sampling/sampling_logp_difference/mean": 0.7162982225418091, "step": 1009 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.28954601287841797, "epoch": 2.6578947368421053, "grad_norm": 10.927779197692871, "learning_rate": 1e-06, "loss": 0.1193, "step": 1010 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.33044156432151794, "epoch": 2.6605263157894736, "grad_norm": 0.021338341757655144, "learning_rate": 1e-06, "loss": 0.3381, "step": 1011 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.287820965051651, "epoch": 2.663157894736842, "grad_norm": 0.009080777876079082, "learning_rate": 1e-06, "loss": 0.1125, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13928.0, "completions/mean_length": 1759.529296875, "completions/mean_terminated_length": 620.3599853515625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.3139393925666809, "epoch": 2.6657894736842103, "frac_reward_zero_std": 0.28125, "grad_norm": 21.068523406982422, "learning_rate": 1e-06, "loss": 0.1538, "num_tokens": 316735123.0, "reward": 0.7530990839004517, "reward_std": 0.20721742510795593, "rewards/progression_diversity/mean": -0.036777839064598083, "rewards/progression_diversity/std": 0.12973876297473907, "rewards/symbolic_reward_accuracy/mean": 0.82421875, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.88720703125, "rewards/symbolic_reward_partial_score/std": 0.28543245792388916, "rewards/tag_count_reward/mean": -0.072265625, "rewards/tag_count_reward/std": 0.2591804563999176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1145870685577393, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 218.0, "sampling/sampling_logp_difference/mean": 0.6381403803825378, "step": 1013 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2958460748195648, "epoch": 2.668421052631579, "grad_norm": 0.33650216460227966, "learning_rate": 1e-06, "loss": 0.0614, "step": 1014 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3110879957675934, "epoch": 2.6710526315789473, "grad_norm": 0.006965796463191509, "learning_rate": 1e-06, "loss": 0.1327, "step": 1015 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.31926093995571136, "epoch": 2.6736842105263157, "grad_norm": 0.009219714440405369, "learning_rate": 1e-06, "loss": 0.17, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15009.0, "completions/mean_length": 1719.177734375, "completions/mean_terminated_length": 676.0731811523438, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.3213798254728317, "epoch": 2.6763157894736844, "frac_reward_zero_std": 0.09375, "grad_norm": 36.93714904785156, "learning_rate": 1e-06, "loss": 0.1665, "num_tokens": 317986126.0, "reward": 0.7775270938873291, "reward_std": 0.24769052863121033, "rewards/progression_diversity/mean": -0.03537972271442413, "rewards/progression_diversity/std": 0.1272072046995163, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.90673828125, "rewards/symbolic_reward_partial_score/std": 0.263979971408844, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1412655115127563, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 219.0, "sampling/sampling_logp_difference/mean": 0.8786369562149048, "step": 1017 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3090389519929886, "epoch": 2.6789473684210527, "grad_norm": 69.12992095947266, "learning_rate": 1e-06, "loss": 0.1427, "step": 1018 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.333890438079834, "epoch": 2.681578947368421, "grad_norm": 0.026694627478718758, "learning_rate": 1e-06, "loss": 0.2929, "step": 1019 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.316163033246994, "epoch": 2.6842105263157894, "grad_norm": 0.014393487945199013, "learning_rate": 1e-06, "loss": 0.1923, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15286.0, "completions/mean_length": 3114.5078125, "completions/mean_terminated_length": 729.668212890625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.36883601546287537, "epoch": 2.6868421052631577, "frac_reward_zero_std": 0.125, "grad_norm": 56.34549331665039, "learning_rate": 1e-06, "loss": 0.3005, "num_tokens": 320001746.0, "reward": 0.7100275158882141, "reward_std": 0.24902528524398804, "rewards/progression_diversity/mean": -0.07147189974784851, "rewards/progression_diversity/std": 0.16751477122306824, "rewards/symbolic_reward_accuracy/mean": 0.7890625, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.84375, "rewards/symbolic_reward_partial_score/std": 0.33665984869003296, "rewards/tag_count_reward/mean": -0.158203125, "rewards/tag_count_reward/std": 0.36528825759887695, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1642587184906006, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 220.0, "sampling/sampling_logp_difference/mean": 1.0265074968338013, "step": 1021 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.33186276257038116, "epoch": 2.6894736842105265, "grad_norm": 2.4869983196258545, "learning_rate": 1e-06, "loss": 0.2048, "step": 1022 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3606336712837219, "epoch": 2.692105263157895, "grad_norm": 1.9981337785720825, "learning_rate": 1e-06, "loss": 0.2636, "step": 1023 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.35093724727630615, "epoch": 2.694736842105263, "grad_norm": 1.4287171363830566, "learning_rate": 1e-06, "loss": 0.2628, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14680.0, "completions/mean_length": 2566.037109375, "completions/mean_terminated_length": 731.7942504882812, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.3709093779325485, "epoch": 2.6973684210526314, "frac_reward_zero_std": 0.09375, "grad_norm": 38.17184829711914, "learning_rate": 1e-06, "loss": 0.4175, "num_tokens": 321699173.0, "reward": 0.709370493888855, "reward_std": 0.2614933252334595, "rewards/progression_diversity/mean": -0.059050630778074265, "rewards/progression_diversity/std": 0.1578987091779709, "rewards/symbolic_reward_accuracy/mean": 0.783203125, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.83984375, "rewards/symbolic_reward_partial_score/std": 0.3304028809070587, "rewards/tag_count_reward/mean": -0.119140625, "rewards/tag_count_reward/std": 0.32427072525024414, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1661169528961182, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 220.0, "sampling/sampling_logp_difference/mean": 1.1539242267608643, "step": 1025 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.40625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.3150624781847, "epoch": 2.7, "grad_norm": 1.1932648420333862, "learning_rate": 1e-06, "loss": 0.1208, "step": 1026 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.35171347856521606, "epoch": 2.7026315789473685, "grad_norm": 0.0064244456589221954, "learning_rate": 1e-06, "loss": 0.3091, "step": 1027 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.32932211458683014, "epoch": 2.705263157894737, "grad_norm": 0.20103150606155396, "learning_rate": 1e-06, "loss": 0.1512, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14449.0, "completions/mean_length": 2274.8984375, "completions/mean_terminated_length": 714.021728515625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.3005029112100601, "epoch": 2.707894736842105, "frac_reward_zero_std": 0.1875, "grad_norm": 63.158145904541016, "learning_rate": 1e-06, "loss": 0.1651, "num_tokens": 323267889.0, "reward": 0.7657883167266846, "reward_std": 0.2607175409793854, "rewards/progression_diversity/mean": -0.05203522741794586, "rewards/progression_diversity/std": 0.15054935216903687, "rewards/symbolic_reward_accuracy/mean": 0.849609375, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.8870442509651184, "rewards/symbolic_reward_partial_score/std": 0.29508423805236816, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.148714303970337, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 221.0, "sampling/sampling_logp_difference/mean": 1.0465000867843628, "step": 1029 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3231324404478073, "epoch": 2.7105263157894735, "grad_norm": 0.05354348570108414, "learning_rate": 1e-06, "loss": 0.175, "step": 1030 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3530917465686798, "epoch": 2.713157894736842, "grad_norm": 0.014254847541451454, "learning_rate": 1e-06, "loss": 0.2848, "step": 1031 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3636767566204071, "epoch": 2.7157894736842105, "grad_norm": 0.34099018573760986, "learning_rate": 1e-06, "loss": 0.3258, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15208.0, "completions/mean_length": 3406.103515625, "completions/mean_terminated_length": 859.0396728515625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.3606864959001541, "epoch": 2.718421052631579, "frac_reward_zero_std": 0.0, "grad_norm": 110.6368637084961, "learning_rate": 1e-06, "loss": 0.2805, "num_tokens": 325423462.0, "reward": 0.6178954243659973, "reward_std": 0.3361561894416809, "rewards/progression_diversity/mean": -0.08546093106269836, "rewards/progression_diversity/std": 0.18625648319721222, "rewards/symbolic_reward_accuracy/mean": 0.666015625, "rewards/symbolic_reward_accuracy/std": 0.47209542989730835, "rewards/symbolic_reward_partial_score/mean": 0.7877604365348816, "rewards/symbolic_reward_partial_score/std": 0.3652539551258087, "rewards/tag_count_reward/mean": -0.171875, "rewards/tag_count_reward/std": 0.3776407241821289, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1710354089736938, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 224.0, "sampling/sampling_logp_difference/mean": 1.457003116607666, "step": 1033 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.36586907505989075, "epoch": 2.7210526315789476, "grad_norm": 47.91674041748047, "learning_rate": 1e-06, "loss": 0.3584, "step": 1034 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.35853834450244904, "epoch": 2.723684210526316, "grad_norm": 1.4703402519226074, "learning_rate": 1e-06, "loss": 0.2924, "step": 1035 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3385375142097473, "epoch": 2.7263157894736842, "grad_norm": 10.245598793029785, "learning_rate": 1e-06, "loss": 0.1748, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10898.0, "completions/mean_length": 2200.62109375, "completions/mean_terminated_length": 597.2825927734375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.30844029784202576, "epoch": 2.7289473684210526, "frac_reward_zero_std": 0.21875, "grad_norm": 21.34285545349121, "learning_rate": 1e-06, "loss": 0.0902, "num_tokens": 326975684.0, "reward": 0.7785071730613708, "reward_std": 0.2188257873058319, "rewards/progression_diversity/mean": -0.05456160753965378, "rewards/progression_diversity/std": 0.16060733795166016, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.9073892831802368, "rewards/symbolic_reward_partial_score/std": 0.2701605260372162, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1437357664108276, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 223.0, "sampling/sampling_logp_difference/mean": 1.1044782400131226, "step": 1037 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.35305924713611603, "epoch": 2.731578947368421, "grad_norm": 2.7072434425354004, "learning_rate": 1e-06, "loss": 0.3355, "step": 1038 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.31338661909103394, "epoch": 2.734210526315789, "grad_norm": 16.57544708251953, "learning_rate": 1e-06, "loss": 0.1466, "step": 1039 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3223544955253601, "epoch": 2.736842105263158, "grad_norm": 0.008024279028177261, "learning_rate": 1e-06, "loss": 0.2434, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14168.0, "completions/mean_length": 2421.57421875, "completions/mean_terminated_length": 637.8281860351562, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.3547728508710861, "epoch": 2.7394736842105263, "frac_reward_zero_std": 0.09375, "grad_norm": 86.12596130371094, "learning_rate": 1e-06, "loss": 0.3437, "num_tokens": 328613322.0, "reward": 0.7703942060470581, "reward_std": 0.2505676746368408, "rewards/progression_diversity/mean": -0.06019480898976326, "rewards/progression_diversity/std": 0.1668085753917694, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.9000651240348816, "rewards/symbolic_reward_partial_score/std": 0.27447789907455444, "rewards/tag_count_reward/mean": -0.111328125, "rewards/tag_count_reward/std": 0.31484565138816833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.165305256843567, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 227.0, "sampling/sampling_logp_difference/mean": 1.1906356811523438, "step": 1041 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4296875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3167977035045624, "epoch": 2.7421052631578946, "grad_norm": 6.371710300445557, "learning_rate": 1e-06, "loss": 0.1303, "step": 1042 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.3315960764884949, "epoch": 2.7447368421052634, "grad_norm": 22.913915634155273, "learning_rate": 1e-06, "loss": 0.2824, "step": 1043 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3117201626300812, "epoch": 2.7473684210526317, "grad_norm": 5.115964412689209, "learning_rate": 1e-06, "loss": 0.253, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13730.0, "completions/mean_length": 2225.83984375, "completions/mean_terminated_length": 659.5358276367188, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.31773585081100464, "epoch": 2.75, "frac_reward_zero_std": 0.1875, "grad_norm": 119.56202697753906, "learning_rate": 1e-06, "loss": 0.1282, "num_tokens": 330152088.0, "reward": 0.742752194404602, "reward_std": 0.24866357445716858, "rewards/progression_diversity/mean": -0.055839426815509796, "rewards/progression_diversity/std": 0.16208583116531372, "rewards/symbolic_reward_accuracy/mean": 0.814453125, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.88134765625, "rewards/symbolic_reward_partial_score/std": 0.2882753908634186, "rewards/tag_count_reward/mean": -0.09765625, "rewards/tag_count_reward/std": 0.29713961482048035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1512091159820557, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 230.999267578125, "sampling/sampling_logp_difference/mean": 1.0916484594345093, "step": 1045 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3315199911594391, "epoch": 2.7526315789473683, "grad_norm": 85.15950012207031, "learning_rate": 1e-06, "loss": 0.2759, "step": 1046 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3466692268848419, "epoch": 2.7552631578947366, "grad_norm": 0.012678657658398151, "learning_rate": 1e-06, "loss": 0.2945, "step": 1047 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.31701667606830597, "epoch": 2.7578947368421054, "grad_norm": 0.01987772062420845, "learning_rate": 1e-06, "loss": 0.1537, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13047.0, "completions/mean_length": 2842.43359375, "completions/mean_terminated_length": 626.5408935546875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.32595211267471313, "epoch": 2.7605263157894737, "frac_reward_zero_std": 0.125, "grad_norm": 42.63386535644531, "learning_rate": 1e-06, "loss": 0.1182, "num_tokens": 332023574.0, "reward": 0.6774349212646484, "reward_std": 0.2632233202457428, "rewards/progression_diversity/mean": -0.06901118159294128, "rewards/progression_diversity/std": 0.17166905105113983, "rewards/symbolic_reward_accuracy/mean": 0.734375, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.8352864980697632, "rewards/symbolic_reward_partial_score/std": 0.32469478249549866, "rewards/tag_count_reward/mean": -0.130859375, "rewards/tag_count_reward/std": 0.33757632970809937, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.155900239944458, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 234.0, "sampling/sampling_logp_difference/mean": 1.1260725259780884, "step": 1049 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.3452259302139282, "epoch": 2.763157894736842, "grad_norm": 5.842362403869629, "learning_rate": 1e-06, "loss": 0.1744, "step": 1050 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3828125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.34844136238098145, "epoch": 2.765789473684211, "grad_norm": 14.326484680175781, "learning_rate": 1e-06, "loss": 0.2425, "step": 1051 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.35987676680088043, "epoch": 2.768421052631579, "grad_norm": 30.41329002380371, "learning_rate": 1e-06, "loss": 0.2773, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12739.0, "completions/mean_length": 2201.59375, "completions/mean_terminated_length": 666.7012939453125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.32392917573451996, "epoch": 2.7710526315789474, "frac_reward_zero_std": 0.25, "grad_norm": 95.98092651367188, "learning_rate": 1e-06, "loss": 0.1847, "num_tokens": 333554758.0, "reward": 0.7248282432556152, "reward_std": 0.2060919851064682, "rewards/progression_diversity/mean": -0.04647231847047806, "rewards/progression_diversity/std": 0.1398431360721588, "rewards/symbolic_reward_accuracy/mean": 0.791015625, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.8675130009651184, "rewards/symbolic_reward_partial_score/std": 0.2977105677127838, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1349936723709106, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 236.0, "sampling/sampling_logp_difference/mean": 0.8933988809585571, "step": 1053 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.33513136208057404, "epoch": 2.7736842105263158, "grad_norm": 0.05125413089990616, "learning_rate": 1e-06, "loss": 0.139, "step": 1054 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.32300393283367157, "epoch": 2.776315789473684, "grad_norm": 2.2907490730285645, "learning_rate": 1e-06, "loss": 0.1541, "step": 1055 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.32895462214946747, "epoch": 2.7789473684210524, "grad_norm": 0.22027532756328583, "learning_rate": 1e-06, "loss": 0.1606, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12516.0, "completions/mean_length": 1867.125, "completions/mean_terminated_length": 536.1535034179688, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.32684555649757385, "epoch": 2.781578947368421, "frac_reward_zero_std": 0.3125, "grad_norm": 27.86194610595703, "learning_rate": 1e-06, "loss": 0.1246, "num_tokens": 334916038.0, "reward": 0.7695657014846802, "reward_std": 0.18517985939979553, "rewards/progression_diversity/mean": -0.040504228323698044, "rewards/progression_diversity/std": 0.13616561889648438, "rewards/symbolic_reward_accuracy/mean": 0.845703125, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.89990234375, "rewards/symbolic_reward_partial_score/std": 0.2672683298587799, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1254425048828125, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 236.0, "sampling/sampling_logp_difference/mean": 0.741882860660553, "step": 1057 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3313567191362381, "epoch": 2.7842105263157895, "grad_norm": 6.761070728302002, "learning_rate": 1e-06, "loss": 0.2295, "step": 1058 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3324154317378998, "epoch": 2.786842105263158, "grad_norm": 0.011156097054481506, "learning_rate": 1e-06, "loss": 0.0708, "step": 1059 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.33027806878089905, "epoch": 2.7894736842105265, "grad_norm": 0.16176286339759827, "learning_rate": 1e-06, "loss": 0.1187, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15293.0, "completions/mean_length": 2273.78125, "completions/mean_terminated_length": 678.7130126953125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.3395443707704544, "epoch": 2.792105263157895, "frac_reward_zero_std": 0.21875, "grad_norm": 84.5785140991211, "learning_rate": 1e-06, "loss": 0.3383, "num_tokens": 336512054.0, "reward": 0.7558015584945679, "reward_std": 0.23675483465194702, "rewards/progression_diversity/mean": -0.05461447685956955, "rewards/progression_diversity/std": 0.15894843637943268, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.8889973759651184, "rewards/symbolic_reward_partial_score/std": 0.28549298644065857, "rewards/tag_count_reward/mean": -0.107421875, "rewards/tag_count_reward/std": 0.30995169281959534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1387587785720825, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 236.0, "sampling/sampling_logp_difference/mean": 0.9233871698379517, "step": 1061 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3283067047595978, "epoch": 2.794736842105263, "grad_norm": 13.06532096862793, "learning_rate": 1e-06, "loss": 0.2576, "step": 1062 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3241392523050308, "epoch": 2.7973684210526315, "grad_norm": 1.6975409984588623, "learning_rate": 1e-06, "loss": 0.0366, "step": 1063 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.31890368461608887, "epoch": 2.8, "grad_norm": 10.018250465393066, "learning_rate": 1e-06, "loss": 0.1212, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14169.0, "completions/mean_length": 2099.845703125, "completions/mean_terminated_length": 622.174560546875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.3201150894165039, "epoch": 2.8026315789473686, "frac_reward_zero_std": 0.125, "grad_norm": 316.5431823730469, "learning_rate": 1e-06, "loss": 0.188, "num_tokens": 338001831.0, "reward": 0.7660241723060608, "reward_std": 0.2371438443660736, "rewards/progression_diversity/mean": -0.052859190851449966, "rewards/progression_diversity/std": 0.16034673154354095, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.8904622793197632, "rewards/symbolic_reward_partial_score/std": 0.28877806663513184, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1355504989624023, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 236.0, "sampling/sampling_logp_difference/mean": 0.9679932594299316, "step": 1065 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.32754945755004883, "epoch": 2.805263157894737, "grad_norm": 55.01374053955078, "learning_rate": 1e-06, "loss": 0.155, "step": 1066 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.33194398880004883, "epoch": 2.807894736842105, "grad_norm": 0.01747787557542324, "learning_rate": 1e-06, "loss": 0.2416, "step": 1067 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.33147796988487244, "epoch": 2.8105263157894735, "grad_norm": 0.019125865772366524, "learning_rate": 1e-06, "loss": 0.2642, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14000.0, "completions/mean_length": 2323.65625, "completions/mean_terminated_length": 596.9473876953125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.33723948895931244, "epoch": 2.8131578947368423, "frac_reward_zero_std": 0.1875, "grad_norm": 55.272857666015625, "learning_rate": 1e-06, "loss": 0.2617, "num_tokens": 339580151.0, "reward": 0.7574684619903564, "reward_std": 0.24873128533363342, "rewards/progression_diversity/mean": -0.06370329856872559, "rewards/progression_diversity/std": 0.176786869764328, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.8811849355697632, "rewards/symbolic_reward_partial_score/std": 0.2997874319553375, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1518162488937378, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 235.0, "sampling/sampling_logp_difference/mean": 1.1923803091049194, "step": 1069 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3266005963087082, "epoch": 2.8157894736842106, "grad_norm": 14.835233688354492, "learning_rate": 1e-06, "loss": 0.1812, "step": 1070 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3344879746437073, "epoch": 2.818421052631579, "grad_norm": 20.65220832824707, "learning_rate": 1e-06, "loss": 0.2772, "step": 1071 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.32967130839824677, "epoch": 2.8210526315789473, "grad_norm": 15.066210746765137, "learning_rate": 1e-06, "loss": 0.2274, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13483.0, "completions/mean_length": 2695.2265625, "completions/mean_terminated_length": 739.6875610351562, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.3279756009578705, "epoch": 2.8236842105263156, "frac_reward_zero_std": 0.125, "grad_norm": 82.50452423095703, "learning_rate": 1e-06, "loss": 0.1288, "num_tokens": 341365259.0, "reward": 0.6973080039024353, "reward_std": 0.2565169632434845, "rewards/progression_diversity/mean": -0.06900518387556076, "rewards/progression_diversity/std": 0.174638569355011, "rewards/symbolic_reward_accuracy/mean": 0.763671875, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.8409830331802368, "rewards/symbolic_reward_partial_score/std": 0.327880322933197, "rewards/tag_count_reward/mean": -0.125, "rewards/tag_count_reward/std": 0.3310423493385315, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.146012783050537, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 238.0, "sampling/sampling_logp_difference/mean": 1.2247239351272583, "step": 1073 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3268892467021942, "epoch": 2.8263157894736843, "grad_norm": 0.03915620595216751, "learning_rate": 1e-06, "loss": 0.3393, "step": 1074 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3162257820367813, "epoch": 2.8289473684210527, "grad_norm": 1.8628556728363037, "learning_rate": 1e-06, "loss": 0.1444, "step": 1075 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.32084639370441437, "epoch": 2.831578947368421, "grad_norm": 0.034546032547950745, "learning_rate": 1e-06, "loss": 0.2549, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13901.0, "completions/mean_length": 2555.38671875, "completions/mean_terminated_length": 684.997802734375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.31973057985305786, "epoch": 2.8342105263157897, "frac_reward_zero_std": 0.09375, "grad_norm": 80.3057861328125, "learning_rate": 1e-06, "loss": 0.1383, "num_tokens": 343049393.0, "reward": 0.7412656545639038, "reward_std": 0.25144657492637634, "rewards/progression_diversity/mean": -0.06289247423410416, "rewards/progression_diversity/std": 0.16770640015602112, "rewards/symbolic_reward_accuracy/mean": 0.8203125, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.8714193105697632, "rewards/symbolic_reward_partial_score/std": 0.30579882860183716, "rewards/tag_count_reward/mean": -0.1171875, "rewards/tag_count_reward/std": 0.32195815443992615, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1399236917495728, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 239.99998474121094, "sampling/sampling_logp_difference/mean": 1.2581195831298828, "step": 1077 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3828125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.31444528698921204, "epoch": 2.836842105263158, "grad_norm": 55.245845794677734, "learning_rate": 1e-06, "loss": 0.2088, "step": 1078 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4921875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.32562339305877686, "epoch": 2.8394736842105264, "grad_norm": 34.23236846923828, "learning_rate": 1e-06, "loss": 0.2752, "step": 1079 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.40625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.32775624096393585, "epoch": 2.8421052631578947, "grad_norm": 23.81574821472168, "learning_rate": 1e-06, "loss": 0.2996, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9691.0, "completions/mean_length": 2100.416015625, "completions/mean_terminated_length": 554.5736083984375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.3156034052371979, "epoch": 2.844736842105263, "frac_reward_zero_std": 0.21875, "grad_norm": 43.40895462036133, "learning_rate": 1e-06, "loss": 0.1877, "num_tokens": 344549286.0, "reward": 0.7650686502456665, "reward_std": 0.24320955574512482, "rewards/progression_diversity/mean": -0.05075865983963013, "rewards/progression_diversity/std": 0.15617826581001282, "rewards/symbolic_reward_accuracy/mean": 0.845703125, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.8904622793197632, "rewards/symbolic_reward_partial_score/std": 0.2910281717777252, "rewards/tag_count_reward/mean": -0.08984375, "rewards/tag_count_reward/std": 0.2862374484539032, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1255333423614502, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 240.0, "sampling/sampling_logp_difference/mean": 1.064498782157898, "step": 1081 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.30958323180675507, "epoch": 2.8473684210526313, "grad_norm": 9.998337745666504, "learning_rate": 1e-06, "loss": 0.1545, "step": 1082 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3141885995864868, "epoch": 2.85, "grad_norm": 0.007370346691459417, "learning_rate": 1e-06, "loss": 0.2394, "step": 1083 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3183276653289795, "epoch": 2.8526315789473684, "grad_norm": 0.008199164643883705, "learning_rate": 1e-06, "loss": 0.2117, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13934.0, "completions/mean_length": 2409.056640625, "completions/mean_terminated_length": 623.71142578125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.323482483625412, "epoch": 2.8552631578947367, "frac_reward_zero_std": 0.1875, "grad_norm": 53.07914352416992, "learning_rate": 1e-06, "loss": 0.1864, "num_tokens": 346174179.0, "reward": 0.7465149164199829, "reward_std": 0.27112072706222534, "rewards/progression_diversity/mean": -0.06531209498643875, "rewards/progression_diversity/std": 0.1783425658941269, "rewards/symbolic_reward_accuracy/mean": 0.82421875, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.8785806894302368, "rewards/symbolic_reward_partial_score/std": 0.3004186153411865, "rewards/tag_count_reward/mean": -0.109375, "rewards/tag_count_reward/std": 0.31241437792778015, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1350785493850708, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 240.0, "sampling/sampling_logp_difference/mean": 1.2513222694396973, "step": 1085 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.32216334342956543, "epoch": 2.8578947368421055, "grad_norm": 3.4325926303863525, "learning_rate": 1e-06, "loss": 0.2676, "step": 1086 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3240419775247574, "epoch": 2.860526315789474, "grad_norm": 15.382451057434082, "learning_rate": 1e-06, "loss": 0.2478, "step": 1087 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.32489949464797974, "epoch": 2.863157894736842, "grad_norm": 0.019483868032693863, "learning_rate": 1e-06, "loss": 0.1895, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12305.0, "completions/mean_length": 2428.5703125, "completions/mean_terminated_length": 645.718017578125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.3152479976415634, "epoch": 2.8657894736842104, "frac_reward_zero_std": 0.21875, "grad_norm": 148.0259246826172, "learning_rate": 1e-06, "loss": 0.1724, "num_tokens": 347824423.0, "reward": 0.7290340662002563, "reward_std": 0.232526957988739, "rewards/progression_diversity/mean": -0.0604589506983757, "rewards/progression_diversity/std": 0.16615836322307587, "rewards/symbolic_reward_accuracy/mean": 0.80078125, "rewards/symbolic_reward_accuracy/std": 0.39980348944664, "rewards/symbolic_reward_partial_score/mean": 0.8663736581802368, "rewards/symbolic_reward_partial_score/std": 0.31190311908721924, "rewards/tag_count_reward/mean": -0.107421875, "rewards/tag_count_reward/std": 0.30995169281959534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.127872109413147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 240.0, "sampling/sampling_logp_difference/mean": 1.2021994590759277, "step": 1089 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.3240260183811188, "epoch": 2.8684210526315788, "grad_norm": 14.310028076171875, "learning_rate": 1e-06, "loss": 0.2462, "step": 1090 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.31351564824581146, "epoch": 2.8710526315789475, "grad_norm": 0.017242752015590668, "learning_rate": 1e-06, "loss": 0.0796, "step": 1091 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3828125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.3323921114206314, "epoch": 2.873684210526316, "grad_norm": 0.0029714410193264484, "learning_rate": 1e-06, "loss": 0.3207, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16382.0, "completions/mean_length": 2427.0859375, "completions/mean_terminated_length": 747.3698120117188, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.31317219138145447, "epoch": 2.876315789473684, "frac_reward_zero_std": 0.1875, "grad_norm": 36.501434326171875, "learning_rate": 1e-06, "loss": 0.2128, "num_tokens": 349474067.0, "reward": 0.7331509590148926, "reward_std": 0.24093718826770782, "rewards/progression_diversity/mean": -0.05892288684844971, "rewards/progression_diversity/std": 0.1655765026807785, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.86767578125, "rewards/symbolic_reward_partial_score/std": 0.31132596731185913, "rewards/tag_count_reward/mean": -0.10546875, "rewards/tag_count_reward/std": 0.3074568510055542, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.153733253479004, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 241.0, "sampling/sampling_logp_difference/mean": 1.242756724357605, "step": 1093 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.3204523026943207, "epoch": 2.8789473684210525, "grad_norm": 3.31962513923645, "learning_rate": 1e-06, "loss": 0.2336, "step": 1094 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.31440404057502747, "epoch": 2.8815789473684212, "grad_norm": 7.084986686706543, "learning_rate": 1e-06, "loss": 0.1256, "step": 1095 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.32728545367717743, "epoch": 2.8842105263157896, "grad_norm": 0.10335738956928253, "learning_rate": 1e-06, "loss": 0.2388, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.142578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16244.0, "completions/mean_length": 2942.568359375, "completions/mean_terminated_length": 707.432861328125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.32194942235946655, "epoch": 2.886842105263158, "frac_reward_zero_std": 0.1875, "grad_norm": 62.69070053100586, "learning_rate": 1e-06, "loss": 0.2512, "num_tokens": 351364438.0, "reward": 0.6914088726043701, "reward_std": 0.22169052064418793, "rewards/progression_diversity/mean": -0.07298420369625092, "rewards/progression_diversity/std": 0.1761486977338791, "rewards/symbolic_reward_accuracy/mean": 0.7578125, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.8409830331802368, "rewards/symbolic_reward_partial_score/std": 0.3356630504131317, "rewards/tag_count_reward/mean": -0.1484375, "rewards/tag_count_reward/std": 0.35588082671165466, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.162922978401184, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 243.0, "sampling/sampling_logp_difference/mean": 1.4567382335662842, "step": 1097 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.32985997200012207, "epoch": 2.889473684210526, "grad_norm": 2.9098451137542725, "learning_rate": 1e-06, "loss": 0.2758, "step": 1098 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3266884684562683, "epoch": 2.8921052631578945, "grad_norm": 0.03355651721358299, "learning_rate": 1e-06, "loss": 0.1963, "step": 1099 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.32841236889362335, "epoch": 2.8947368421052633, "grad_norm": 6.091196537017822, "learning_rate": 1e-06, "loss": 0.2498, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16313.0, "completions/mean_length": 2224.619140625, "completions/mean_terminated_length": 589.657958984375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.3271956741809845, "epoch": 2.8973684210526316, "frac_reward_zero_std": 0.25, "grad_norm": 43.073707580566406, "learning_rate": 1e-06, "loss": 0.1519, "num_tokens": 352894899.0, "reward": 0.7742513418197632, "reward_std": 0.20133471488952637, "rewards/progression_diversity/mean": -0.05045757442712784, "rewards/progression_diversity/std": 0.14818480610847473, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.89697265625, "rewards/symbolic_reward_partial_score/std": 0.2875053882598877, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1496270895004272, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 246.0, "sampling/sampling_logp_difference/mean": 1.6051921844482422, "step": 1101 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3244181126356125, "epoch": 2.9, "grad_norm": 0.012723026797175407, "learning_rate": 1e-06, "loss": 0.2311, "step": 1102 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.33277395367622375, "epoch": 2.9026315789473687, "grad_norm": 2.2688472270965576, "learning_rate": 1e-06, "loss": 0.2433, "step": 1103 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.33416812121868134, "epoch": 2.905263157894737, "grad_norm": 5.079883098602295, "learning_rate": 1e-06, "loss": 0.1993, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12811.0, "completions/mean_length": 1826.296875, "completions/mean_terminated_length": 559.0658569335938, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.3188180774450302, "epoch": 2.9078947368421053, "frac_reward_zero_std": 0.21875, "grad_norm": 23.28694725036621, "learning_rate": 1e-06, "loss": 0.1536, "num_tokens": 354244779.0, "reward": 0.7635840177536011, "reward_std": 0.22885069251060486, "rewards/progression_diversity/mean": -0.038090769201517105, "rewards/progression_diversity/std": 0.12966662645339966, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.9059244394302368, "rewards/symbolic_reward_partial_score/std": 0.2578800618648529, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1475815773010254, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 248.0, "sampling/sampling_logp_difference/mean": 1.869348168373108, "step": 1105 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3273351788520813, "epoch": 2.9105263157894736, "grad_norm": 4.7783918380737305, "learning_rate": 1e-06, "loss": 0.1048, "step": 1106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3305172771215439, "epoch": 2.913157894736842, "grad_norm": 0.007633809465914965, "learning_rate": 1e-06, "loss": 0.2638, "step": 1107 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.333538293838501, "epoch": 2.9157894736842103, "grad_norm": 0.015457144938409328, "learning_rate": 1e-06, "loss": 0.2158, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16183.0, "completions/mean_length": 2176.537109375, "completions/mean_terminated_length": 604.7787475585938, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3226548731327057, "epoch": 2.918421052631579, "frac_reward_zero_std": 0.15625, "grad_norm": 108.32244873046875, "learning_rate": 1e-06, "loss": 0.1848, "num_tokens": 355747614.0, "reward": 0.7507563233375549, "reward_std": 0.25011780858039856, "rewards/progression_diversity/mean": -0.04643935710191727, "rewards/progression_diversity/std": 0.1406102478504181, "rewards/symbolic_reward_accuracy/mean": 0.826171875, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.8816731572151184, "rewards/symbolic_reward_partial_score/std": 0.2946558892726898, "rewards/tag_count_reward/mean": -0.08984375, "rewards/tag_count_reward/std": 0.2862374484539032, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1501786708831787, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 252.0, "sampling/sampling_logp_difference/mean": 2.1973791122436523, "step": 1109 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.32274968922138214, "epoch": 2.9210526315789473, "grad_norm": 0.011330176144838333, "learning_rate": 1e-06, "loss": 0.3078, "step": 1110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.31783410906791687, "epoch": 2.9236842105263157, "grad_norm": 0.008597995154559612, "learning_rate": 1e-06, "loss": 0.1984, "step": 1111 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3165570944547653, "epoch": 2.9263157894736844, "grad_norm": 0.3951316773891449, "learning_rate": 1e-06, "loss": 0.2277, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11247.0, "completions/mean_length": 2216.048828125, "completions/mean_terminated_length": 614.454345703125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3109191954135895, "epoch": 2.9289473684210527, "frac_reward_zero_std": 0.15625, "grad_norm": 183.47438049316406, "learning_rate": 1e-06, "loss": 0.3146, "num_tokens": 357277879.0, "reward": 0.769425630569458, "reward_std": 0.23327568173408508, "rewards/progression_diversity/mean": -0.04474321007728577, "rewards/progression_diversity/std": 0.13221436738967896, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.8937174081802368, "rewards/symbolic_reward_partial_score/std": 0.2835972309112549, "rewards/tag_count_reward/mean": -0.103515625, "rewards/tag_count_reward/std": 0.30492907762527466, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1350001096725464, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 254.0, "sampling/sampling_logp_difference/mean": 2.6123390197753906, "step": 1113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.32664044201374054, "epoch": 2.931578947368421, "grad_norm": 84.57691192626953, "learning_rate": 1e-06, "loss": 0.2191, "step": 1114 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.31761452555656433, "epoch": 2.9342105263157894, "grad_norm": 0.06620530039072037, "learning_rate": 1e-06, "loss": 0.1672, "step": 1115 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3253130316734314, "epoch": 2.9368421052631577, "grad_norm": 0.013269034214317799, "learning_rate": 1e-06, "loss": 0.1481, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9632.0, "completions/mean_length": 2862.21875, "completions/mean_terminated_length": 577.7168579101562, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.33379143476486206, "epoch": 2.9394736842105265, "frac_reward_zero_std": 0.0625, "grad_norm": 188.9947052001953, "learning_rate": 1e-06, "loss": 0.2634, "num_tokens": 359162663.0, "reward": 0.7037363648414612, "reward_std": 0.26258164644241333, "rewards/progression_diversity/mean": -0.05605129897594452, "rewards/progression_diversity/std": 0.14197811484336853, "rewards/symbolic_reward_accuracy/mean": 0.76953125, "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, "rewards/symbolic_reward_partial_score/mean": 0.8522135019302368, "rewards/symbolic_reward_partial_score/std": 0.3169857859611511, "rewards/tag_count_reward/mean": -0.130859375, "rewards/tag_count_reward/std": 0.33757632970809937, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1356170177459717, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 258.0, "sampling/sampling_logp_difference/mean": 3.2635445594787598, "step": 1117 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.323417529463768, "epoch": 2.942105263157895, "grad_norm": 13.859256744384766, "learning_rate": 1e-06, "loss": 0.2917, "step": 1118 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3256514072418213, "epoch": 2.944736842105263, "grad_norm": 0.9161269664764404, "learning_rate": 1e-06, "loss": 0.2028, "step": 1119 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.3238846957683563, "epoch": 2.9473684210526314, "grad_norm": 2.7128119468688965, "learning_rate": 1e-06, "loss": 0.1348, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16291.0, "completions/mean_length": 2118.107421875, "completions/mean_terminated_length": 642.325439453125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.3269234597682953, "epoch": 2.95, "frac_reward_zero_std": 0.15625, "grad_norm": 149.16152954101562, "learning_rate": 1e-06, "loss": 0.1463, "num_tokens": 360662974.0, "reward": 0.7428156137466431, "reward_std": 0.2521783113479614, "rewards/progression_diversity/mean": -0.039731502532958984, "rewards/progression_diversity/std": 0.12264791131019592, "rewards/symbolic_reward_accuracy/mean": 0.810546875, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.8855794072151184, "rewards/symbolic_reward_partial_score/std": 0.2812291979789734, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1260616779327393, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 260.0, "sampling/sampling_logp_difference/mean": 2.9290082454681396, "step": 1121 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3308548033237457, "epoch": 2.9526315789473685, "grad_norm": 105.0246353149414, "learning_rate": 1e-06, "loss": 0.2037, "step": 1122 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3392486423254013, "epoch": 2.955263157894737, "grad_norm": 1.2266844511032104, "learning_rate": 1e-06, "loss": 0.2545, "step": 1123 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.33492106199264526, "epoch": 2.957894736842105, "grad_norm": 0.017569491639733315, "learning_rate": 1e-06, "loss": 0.2375, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16326.0, "completions/mean_length": 2407.78125, "completions/mean_terminated_length": 656.914306640625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.3292433023452759, "epoch": 2.9605263157894735, "frac_reward_zero_std": 0.3125, "grad_norm": 124.67893981933594, "learning_rate": 1e-06, "loss": 0.1289, "num_tokens": 362307246.0, "reward": 0.7468899488449097, "reward_std": 0.2119377851486206, "rewards/progression_diversity/mean": -0.047334060072898865, "rewards/progression_diversity/std": 0.13447138667106628, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.8844400644302368, "rewards/symbolic_reward_partial_score/std": 0.29540929198265076, "rewards/tag_count_reward/mean": -0.11328125, "rewards/tag_count_reward/std": 0.3172462284564972, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.115929365158081, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 261.95855712890625, "sampling/sampling_logp_difference/mean": 2.85267972946167, "step": 1125 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.33551347255706787, "epoch": 2.963157894736842, "grad_norm": 46.468929290771484, "learning_rate": 1e-06, "loss": 0.2668, "step": 1126 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3421187102794647, "epoch": 2.9657894736842105, "grad_norm": 0.010265377350151539, "learning_rate": 1e-06, "loss": 0.1918, "step": 1127 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.33866679668426514, "epoch": 2.968421052631579, "grad_norm": 0.012180095538496971, "learning_rate": 1e-06, "loss": 0.2088, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12667.0, "completions/mean_length": 2409.67578125, "completions/mean_terminated_length": 589.6203002929688, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.32998014986515045, "epoch": 2.9710526315789476, "frac_reward_zero_std": 0.1875, "grad_norm": 134.6818084716797, "learning_rate": 1e-06, "loss": 0.2747, "num_tokens": 363944808.0, "reward": 0.7039549350738525, "reward_std": 0.259724497795105, "rewards/progression_diversity/mean": -0.04884590953588486, "rewards/progression_diversity/std": 0.13787634670734406, "rewards/symbolic_reward_accuracy/mean": 0.76953125, "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, "rewards/symbolic_reward_partial_score/mean": 0.8474935293197632, "rewards/symbolic_reward_partial_score/std": 0.3241187334060669, "rewards/tag_count_reward/mean": -0.115234375, "rewards/tag_count_reward/std": 0.3196168541908264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1256601810455322, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 262.0, "sampling/sampling_logp_difference/mean": 3.3490161895751953, "step": 1129 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3265266716480255, "epoch": 2.973684210526316, "grad_norm": 0.2271856814622879, "learning_rate": 1e-06, "loss": 0.2083, "step": 1130 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3257727771997452, "epoch": 2.9763157894736842, "grad_norm": 0.01293912809342146, "learning_rate": 1e-06, "loss": 0.2484, "step": 1131 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.32893073558807373, "epoch": 2.9789473684210526, "grad_norm": 0.01243510190397501, "learning_rate": 1e-06, "loss": 0.1611, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16267.0, "completions/mean_length": 2278.9140625, "completions/mean_terminated_length": 684.426025390625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.3159499317407608, "epoch": 2.981578947368421, "frac_reward_zero_std": 0.03125, "grad_norm": 262.58709716796875, "learning_rate": 1e-06, "loss": 0.2098, "num_tokens": 365549628.0, "reward": 0.6847292184829712, "reward_std": 0.29013562202453613, "rewards/progression_diversity/mean": -0.042704228311777115, "rewards/progression_diversity/std": 0.12792158126831055, "rewards/symbolic_reward_accuracy/mean": 0.736328125, "rewards/symbolic_reward_accuracy/std": 0.4410543739795685, "rewards/symbolic_reward_partial_score/mean": 0.8444010019302368, "rewards/symbolic_reward_partial_score/std": 0.31147703528404236, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1283180713653564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 264.0, "sampling/sampling_logp_difference/mean": 3.4847636222839355, "step": 1133 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.3305753022432327, "epoch": 2.984210526315789, "grad_norm": 2.827713966369629, "learning_rate": 1e-06, "loss": 0.2373, "step": 1134 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.3246810734272003, "epoch": 2.986842105263158, "grad_norm": 30.311552047729492, "learning_rate": 1e-06, "loss": 0.199, "step": 1135 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.32095228135585785, "epoch": 2.9894736842105263, "grad_norm": 0.029631303623318672, "learning_rate": 1e-06, "loss": 0.2114, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16215.0, "completions/mean_length": 1764.580078125, "completions/mean_terminated_length": 658.90966796875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.3265077769756317, "epoch": 2.9921052631578946, "frac_reward_zero_std": 0.4375, "grad_norm": 68.62413787841797, "learning_rate": 1e-06, "loss": 0.136, "num_tokens": 366860069.0, "reward": 0.7820947170257568, "reward_std": 0.12925215065479279, "rewards/progression_diversity/mean": -0.02784038335084915, "rewards/progression_diversity/std": 0.09940054267644882, "rewards/symbolic_reward_accuracy/mean": 0.857421875, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.91845703125, "rewards/symbolic_reward_partial_score/std": 0.24726058542728424, "rewards/tag_count_reward/mean": -0.076171875, "rewards/tag_count_reward/std": 0.26553234457969666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.105454683303833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 264.0, "sampling/sampling_logp_difference/mean": 2.1433823108673096, "step": 1137 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.33374376595020294, "epoch": 2.9947368421052634, "grad_norm": 35.08319854736328, "learning_rate": 1e-06, "loss": 0.202, "step": 1138 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.32793696224689484, "epoch": 2.9973684210526317, "grad_norm": 5.97829532623291, "learning_rate": 1e-06, "loss": 0.1058, "step": 1139 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3303612768650055, "epoch": 3.0, "grad_norm": 61.562416076660156, "learning_rate": 1e-06, "loss": 0.1287, "step": 1140 }, { "epoch": 3.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.04638671875, "eval_completions/max_length": 16384.0, "eval_completions/max_terminated_length": 5637.46875, "eval_completions/mean_length": 1238.202880859375, "eval_completions/mean_terminated_length": 502.11154556274414, "eval_completions/min_length": 207.375, "eval_completions/min_terminated_length": 207.375, "eval_entropy": 0.33187594451010227, "eval_frac_reward_zero_std": 0.28515625, "eval_loss": 0.05398595333099365, "eval_num_tokens": 366860069.0, "eval_reward": 0.7990773729979992, "eval_reward_std": 0.21094764932058752, "eval_rewards/progression_diversity/mean": -0.020244480154360645, "eval_rewards/progression_diversity/std": 0.08825050009181723, "eval_rewards/symbolic_reward_accuracy/mean": 0.87890625, "eval_rewards/symbolic_reward_accuracy/std": 0.3186751971952617, "eval_rewards/symbolic_reward_partial_score/mean": 0.9210205078125, "eval_rewards/symbolic_reward_partial_score/std": 0.22956605115905404, "eval_rewards/tag_count_reward/mean": -0.043701171875, "eval_rewards/tag_count_reward/std": 0.19665716541931033, "eval_runtime": 4075.6424, "eval_samples_per_second": 0.061, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.084825448691845, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 265.8116159439087, "eval_sampling/sampling_logp_difference/mean": 1.002108539454639, "eval_steps_per_second": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 2117.115234375, "completions/mean_terminated_length": 538.7830810546875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.3300452083349228, "epoch": 3.0026315789473683, "frac_reward_zero_std": 0.3125, "grad_norm": 52.12168884277344, "learning_rate": 1e-06, "loss": 0.0937, "num_tokens": 368347840.0, "reward": 0.7367914915084839, "reward_std": 0.1933838427066803, "rewards/progression_diversity/mean": -0.03667115420103073, "rewards/progression_diversity/std": 0.11381202191114426, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.87451171875, "rewards/symbolic_reward_partial_score/std": 0.3029172122478485, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1042859554290771, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 266.0, "sampling/sampling_logp_difference/mean": 2.27083158493042, "step": 1141 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.32495447993278503, "epoch": 3.0052631578947366, "grad_norm": 1.0844265222549438, "learning_rate": 1e-06, "loss": 0.1562, "step": 1142 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.31977418065071106, "epoch": 3.0078947368421054, "grad_norm": 32.22896194458008, "learning_rate": 1e-06, "loss": 0.0852, "step": 1143 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.33107973635196686, "epoch": 3.0105263157894737, "grad_norm": 0.01271702442318201, "learning_rate": 1e-06, "loss": 0.183, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 1620.275390625, "completions/mean_terminated_length": 536.9832153320312, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3337739109992981, "epoch": 3.013157894736842, "frac_reward_zero_std": 0.28125, "grad_norm": 256.0187683105469, "learning_rate": 1e-06, "loss": 0.1703, "num_tokens": 369584397.0, "reward": 0.7931051254272461, "reward_std": 0.1887580156326294, "rewards/progression_diversity/mean": -0.02543444186449051, "rewards/progression_diversity/std": 0.09666049480438232, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.923828125, "rewards/symbolic_reward_partial_score/std": 0.23379817605018616, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.108473300933838, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 268.0, "sampling/sampling_logp_difference/mean": 2.444417953491211, "step": 1145 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.33417925238609314, "epoch": 3.0157894736842104, "grad_norm": 152.89532470703125, "learning_rate": 1e-06, "loss": 0.2239, "step": 1146 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3302992135286331, "epoch": 3.018421052631579, "grad_norm": 2.4071531295776367, "learning_rate": 1e-06, "loss": 0.1218, "step": 1147 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.32197025418281555, "epoch": 3.0210526315789474, "grad_norm": 0.013167927972972393, "learning_rate": 1e-06, "loss": 0.0594, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13552.0, "completions/mean_length": 1762.455078125, "completions/mean_terminated_length": 590.263671875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.31848709285259247, "epoch": 3.0236842105263158, "frac_reward_zero_std": 0.25, "grad_norm": 167.57388305664062, "learning_rate": 1e-06, "loss": 0.0925, "num_tokens": 370898422.0, "reward": 0.775731086730957, "reward_std": 0.21161219477653503, "rewards/progression_diversity/mean": -0.029432490468025208, "rewards/progression_diversity/std": 0.1058623269200325, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.8986002802848816, "rewards/symbolic_reward_partial_score/std": 0.2755959630012512, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1008307933807373, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 268.0, "sampling/sampling_logp_difference/mean": 2.5271763801574707, "step": 1149 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.32427777349948883, "epoch": 3.026315789473684, "grad_norm": 0.09036588668823242, "learning_rate": 1e-06, "loss": 0.1532, "step": 1150 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.32039374113082886, "epoch": 3.028947368421053, "grad_norm": 0.03532138466835022, "learning_rate": 1e-06, "loss": 0.1982, "step": 1151 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3186514228582382, "epoch": 3.031578947368421, "grad_norm": 0.011541608721017838, "learning_rate": 1e-06, "loss": 0.1511, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9474.0, "completions/mean_length": 1170.345703125, "completions/mean_terminated_length": 551.9044799804688, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.3213903158903122, "epoch": 3.0342105263157895, "frac_reward_zero_std": 0.4375, "grad_norm": 25.063764572143555, "learning_rate": 1e-06, "loss": 0.0644, "num_tokens": 371893767.0, "reward": 0.8350409269332886, "reward_std": 0.14898480474948883, "rewards/progression_diversity/mean": -0.016419440507888794, "rewards/progression_diversity/std": 0.08188439160585403, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9454752802848816, "rewards/symbolic_reward_partial_score/std": 0.20989501476287842, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.088158369064331, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 268.0, "sampling/sampling_logp_difference/mean": 1.5961803197860718, "step": 1153 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3240705579519272, "epoch": 3.036842105263158, "grad_norm": 0.00562731409445405, "learning_rate": 1e-06, "loss": 0.1014, "step": 1154 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.32319276034832, "epoch": 3.039473684210526, "grad_norm": 0.006894012447446585, "learning_rate": 1e-06, "loss": 0.097, "step": 1155 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3254219591617584, "epoch": 3.042105263157895, "grad_norm": 0.0085447384044528, "learning_rate": 1e-06, "loss": 0.0831, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14955.0, "completions/mean_length": 1684.576171875, "completions/mean_terminated_length": 606.0020751953125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.3363809883594513, "epoch": 3.044736842105263, "frac_reward_zero_std": 0.28125, "grad_norm": 185.30889892578125, "learning_rate": 1e-06, "loss": 0.1307, "num_tokens": 373167598.0, "reward": 0.7592610120773315, "reward_std": 0.2298196256160736, "rewards/progression_diversity/mean": -0.026055842638015747, "rewards/progression_diversity/std": 0.09842479228973389, "rewards/symbolic_reward_accuracy/mean": 0.830078125, "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, "rewards/symbolic_reward_partial_score/mean": 0.89306640625, "rewards/symbolic_reward_partial_score/std": 0.2722471356391907, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0948313474655151, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 270.0, "sampling/sampling_logp_difference/mean": 2.451362133026123, "step": 1157 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.32949453592300415, "epoch": 3.0473684210526315, "grad_norm": 0.15845933556556702, "learning_rate": 1e-06, "loss": 0.15, "step": 1158 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3209853768348694, "epoch": 3.05, "grad_norm": 0.012255529873073101, "learning_rate": 1e-06, "loss": 0.1257, "step": 1159 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3235623836517334, "epoch": 3.0526315789473686, "grad_norm": 0.0065957182087004185, "learning_rate": 1e-06, "loss": 0.1606, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 10149.0, "completions/mean_length": 1787.939453125, "completions/mean_terminated_length": 584.458740234375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.3329770565032959, "epoch": 3.055263157894737, "frac_reward_zero_std": 0.3125, "grad_norm": 219.4588165283203, "learning_rate": 1e-06, "loss": 0.1499, "num_tokens": 374477807.0, "reward": 0.7548931837081909, "reward_std": 0.1553860455751419, "rewards/progression_diversity/mean": -0.02826322615146637, "rewards/progression_diversity/std": 0.10057384520769119, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.8968098759651184, "rewards/symbolic_reward_partial_score/std": 0.26359865069389343, "rewards/tag_count_reward/mean": -0.072265625, "rewards/tag_count_reward/std": 0.2591804563999176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0940297842025757, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 272.0, "sampling/sampling_logp_difference/mean": 2.350238084793091, "step": 1161 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3281650245189667, "epoch": 3.057894736842105, "grad_norm": 0.007984149269759655, "learning_rate": 1e-06, "loss": 0.2052, "step": 1162 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3284861296415329, "epoch": 3.0605263157894735, "grad_norm": 0.035218384116888046, "learning_rate": 1e-06, "loss": 0.12, "step": 1163 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3274737000465393, "epoch": 3.0631578947368423, "grad_norm": 0.012610013596713543, "learning_rate": 1e-06, "loss": 0.1512, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1523.06640625, "completions/mean_terminated_length": 532.3375244140625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.3336295783519745, "epoch": 3.0657894736842106, "frac_reward_zero_std": 0.4375, "grad_norm": 120.76913452148438, "learning_rate": 1e-06, "loss": 0.1477, "num_tokens": 375667281.0, "reward": 0.8087124824523926, "reward_std": 0.15946154296398163, "rewards/progression_diversity/mean": -0.022307991981506348, "rewards/progression_diversity/std": 0.08901267498731613, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.93212890625, "rewards/symbolic_reward_partial_score/std": 0.231436625123024, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0914645195007324, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 272.0, "sampling/sampling_logp_difference/mean": 1.7476840019226074, "step": 1165 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.32854387164115906, "epoch": 3.068421052631579, "grad_norm": 0.015112695284187794, "learning_rate": 1e-06, "loss": 0.1242, "step": 1166 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3354027718305588, "epoch": 3.0710526315789473, "grad_norm": 0.6051515340805054, "learning_rate": 1e-06, "loss": 0.0242, "step": 1167 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3323533236980438, "epoch": 3.0736842105263156, "grad_norm": 0.01162874884903431, "learning_rate": 1e-06, "loss": 0.2038, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 1519.060546875, "completions/mean_terminated_length": 528.0646362304688, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.34026047587394714, "epoch": 3.0763157894736843, "frac_reward_zero_std": 0.34375, "grad_norm": 21.67325210571289, "learning_rate": 1e-06, "loss": 0.0595, "num_tokens": 376820816.0, "reward": 0.8038376569747925, "reward_std": 0.16369682550430298, "rewards/progression_diversity/mean": -0.021513454616069794, "rewards/progression_diversity/std": 0.08785874396562576, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9295247197151184, "rewards/symbolic_reward_partial_score/std": 0.22601304948329926, "rewards/tag_count_reward/mean": -0.056640625, "rewards/tag_count_reward/std": 0.23138070106506348, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0951886177062988, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 272.0, "sampling/sampling_logp_difference/mean": 1.9224200248718262, "step": 1169 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.33099737763404846, "epoch": 3.0789473684210527, "grad_norm": 0.010425153188407421, "learning_rate": 1e-06, "loss": 0.1494, "step": 1170 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3235698342323303, "epoch": 3.081578947368421, "grad_norm": 0.013261355459690094, "learning_rate": 1e-06, "loss": 0.1044, "step": 1171 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3331771790981293, "epoch": 3.0842105263157893, "grad_norm": 0.008659729734063148, "learning_rate": 1e-06, "loss": 0.2015, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15931.0, "completions/mean_length": 1108.689453125, "completions/mean_terminated_length": 552.0991821289062, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.33606918156147003, "epoch": 3.086842105263158, "frac_reward_zero_std": 0.46875, "grad_norm": 340.97711181640625, "learning_rate": 1e-06, "loss": 0.1414, "num_tokens": 377781745.0, "reward": 0.8523913025856018, "reward_std": 0.13625648617744446, "rewards/progression_diversity/mean": -0.014778539538383484, "rewards/progression_diversity/std": 0.07598426192998886, "rewards/symbolic_reward_accuracy/mean": 0.947265625, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.9596354365348816, "rewards/symbolic_reward_partial_score/std": 0.185786634683609, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0950132608413696, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 274.0, "sampling/sampling_logp_difference/mean": 1.4215747117996216, "step": 1173 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3379185199737549, "epoch": 3.0894736842105264, "grad_norm": 0.018991535529494286, "learning_rate": 1e-06, "loss": 0.1565, "step": 1174 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.33449527621269226, "epoch": 3.0921052631578947, "grad_norm": 0.005514142569154501, "learning_rate": 1e-06, "loss": 0.062, "step": 1175 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.32939158380031586, "epoch": 3.094736842105263, "grad_norm": 0.005196165293455124, "learning_rate": 1e-06, "loss": 0.0599, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15172.0, "completions/mean_length": 1775.599609375, "completions/mean_terminated_length": 604.4619750976562, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.3369821459054947, "epoch": 3.0973684210526318, "frac_reward_zero_std": 0.3125, "grad_norm": 145.88417053222656, "learning_rate": 1e-06, "loss": 0.1591, "num_tokens": 379096324.0, "reward": 0.7732741236686707, "reward_std": 0.21388080716133118, "rewards/progression_diversity/mean": -0.026106324046850204, "rewards/progression_diversity/std": 0.09331251680850983, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.9078775644302368, "rewards/symbolic_reward_partial_score/std": 0.2603663206100464, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.103142499923706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 276.0, "sampling/sampling_logp_difference/mean": 2.0799942016601562, "step": 1177 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.34052442014217377, "epoch": 3.1, "grad_norm": 0.011826397851109505, "learning_rate": 1e-06, "loss": 0.1395, "step": 1178 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3255487084388733, "epoch": 3.1026315789473684, "grad_norm": 0.008594873361289501, "learning_rate": 1e-06, "loss": 0.1081, "step": 1179 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3336612284183502, "epoch": 3.1052631578947367, "grad_norm": 0.01105382852256298, "learning_rate": 1e-06, "loss": 0.1773, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 1904.052734375, "completions/mean_terminated_length": 576.4669799804688, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.3368203490972519, "epoch": 3.1078947368421055, "frac_reward_zero_std": 0.28125, "grad_norm": 61.46799087524414, "learning_rate": 1e-06, "loss": 0.0739, "num_tokens": 380476511.0, "reward": 0.7395263910293579, "reward_std": 0.2011542022228241, "rewards/progression_diversity/mean": -0.026853924617171288, "rewards/progression_diversity/std": 0.09222765266895294, "rewards/symbolic_reward_accuracy/mean": 0.802734375, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.88525390625, "rewards/symbolic_reward_partial_score/std": 0.2737979590892792, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0973585844039917, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 276.0, "sampling/sampling_logp_difference/mean": 1.8168814182281494, "step": 1181 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3273409307003021, "epoch": 3.110526315789474, "grad_norm": 0.014104950241744518, "learning_rate": 1e-06, "loss": 0.1636, "step": 1182 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.315226212143898, "epoch": 3.113157894736842, "grad_norm": 0.3530890643596649, "learning_rate": 1e-06, "loss": 0.1942, "step": 1183 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3299044966697693, "epoch": 3.1157894736842104, "grad_norm": 0.017564065754413605, "learning_rate": 1e-06, "loss": 0.1096, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 10168.0, "completions/mean_length": 1682.30859375, "completions/mean_terminated_length": 603.568115234375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.32700584828853607, "epoch": 3.1184210526315788, "frac_reward_zero_std": 0.34375, "grad_norm": 11.54803466796875, "learning_rate": 1e-06, "loss": 0.0785, "num_tokens": 381744989.0, "reward": 0.7641712427139282, "reward_std": 0.18420130014419556, "rewards/progression_diversity/mean": -0.023311495780944824, "rewards/progression_diversity/std": 0.08777539432048798, "rewards/symbolic_reward_accuracy/mean": 0.83203125, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.90478515625, "rewards/symbolic_reward_partial_score/std": 0.2593802213668823, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0980756282806396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 278.0, "sampling/sampling_logp_difference/mean": 1.7065038681030273, "step": 1185 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3249793201684952, "epoch": 3.1210526315789475, "grad_norm": 242.987060546875, "learning_rate": 1e-06, "loss": 0.157, "step": 1186 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3353985846042633, "epoch": 3.123684210526316, "grad_norm": 0.011672238819301128, "learning_rate": 1e-06, "loss": 0.164, "step": 1187 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3257022500038147, "epoch": 3.126315789473684, "grad_norm": 0.03746514022350311, "learning_rate": 1e-06, "loss": 0.1344, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 2012.21484375, "completions/mean_terminated_length": 593.540771484375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.326521098613739, "epoch": 3.1289473684210525, "frac_reward_zero_std": 0.28125, "grad_norm": 233.8143310546875, "learning_rate": 1e-06, "loss": 0.1006, "num_tokens": 383184043.0, "reward": 0.7504065632820129, "reward_std": 0.20918390154838562, "rewards/progression_diversity/mean": -0.027706898748874664, "rewards/progression_diversity/std": 0.09274362027645111, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.8850911855697632, "rewards/symbolic_reward_partial_score/std": 0.289438933134079, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1031694412231445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 278.0, "sampling/sampling_logp_difference/mean": 1.8653408288955688, "step": 1189 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.32067760825157166, "epoch": 3.1315789473684212, "grad_norm": 49.3584098815918, "learning_rate": 1e-06, "loss": 0.1867, "step": 1190 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3191855549812317, "epoch": 3.1342105263157896, "grad_norm": 0.021937908604741096, "learning_rate": 1e-06, "loss": 0.2106, "step": 1191 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.30871887505054474, "epoch": 3.136842105263158, "grad_norm": 0.017672132700681686, "learning_rate": 1e-06, "loss": 0.1299, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7855.0, "completions/mean_length": 2210.833984375, "completions/mean_terminated_length": 608.6499633789062, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.3056802898645401, "epoch": 3.139473684210526, "frac_reward_zero_std": 0.1875, "grad_norm": 79.18196868896484, "learning_rate": 1e-06, "loss": 0.2021, "num_tokens": 384727958.0, "reward": 0.754828691482544, "reward_std": 0.21231617033481598, "rewards/progression_diversity/mean": -0.03471089154481888, "rewards/progression_diversity/std": 0.10704705864191055, "rewards/symbolic_reward_accuracy/mean": 0.83203125, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.8811848759651184, "rewards/symbolic_reward_partial_score/std": 0.29755792021751404, "rewards/tag_count_reward/mean": -0.083984375, "rewards/tag_count_reward/std": 0.2776356339454651, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1085774898529053, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 280.0, "sampling/sampling_logp_difference/mean": 2.3273110389709473, "step": 1193 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3188657760620117, "epoch": 3.1421052631578945, "grad_norm": 0.007435488048940897, "learning_rate": 1e-06, "loss": 0.1936, "step": 1194 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.31570450961589813, "epoch": 3.1447368421052633, "grad_norm": 0.71424400806427, "learning_rate": 1e-06, "loss": 0.1574, "step": 1195 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.390625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.3103174865245819, "epoch": 3.1473684210526316, "grad_norm": 2.483917474746704, "learning_rate": 1e-06, "loss": 0.2062, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 11863.0, "completions/mean_length": 2258.6640625, "completions/mean_terminated_length": 593.2314453125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.3229822814464569, "epoch": 3.15, "frac_reward_zero_std": 0.25, "grad_norm": 142.69839477539062, "learning_rate": 1e-06, "loss": 0.1636, "num_tokens": 386288202.0, "reward": 0.7562662363052368, "reward_std": 0.2117644101381302, "rewards/progression_diversity/mean": -0.032554611563682556, "rewards/progression_diversity/std": 0.09795653074979782, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.8787435293197632, "rewards/symbolic_reward_partial_score/std": 0.3041481375694275, "rewards/tag_count_reward/mean": -0.09765625, "rewards/tag_count_reward/std": 0.29713961482048035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.106873869895935, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 280.0, "sampling/sampling_logp_difference/mean": 1.8862922191619873, "step": 1197 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.31533288955688477, "epoch": 3.1526315789473682, "grad_norm": 63.04111099243164, "learning_rate": 1e-06, "loss": 0.1067, "step": 1198 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3111482411623001, "epoch": 3.155263157894737, "grad_norm": 0.010430481284856796, "learning_rate": 1e-06, "loss": 0.2744, "step": 1199 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.32699713110923767, "epoch": 3.1578947368421053, "grad_norm": 79.19801330566406, "learning_rate": 1e-06, "loss": 0.1564, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 2007.53125, "completions/mean_terminated_length": 588.3948364257812, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.3115497827529907, "epoch": 3.1605263157894736, "frac_reward_zero_std": 0.28125, "grad_norm": 73.29586029052734, "learning_rate": 1e-06, "loss": 0.1916, "num_tokens": 387721850.0, "reward": 0.7630906701087952, "reward_std": 0.19893735647201538, "rewards/progression_diversity/mean": -0.028826594352722168, "rewards/progression_diversity/std": 0.09443072229623795, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.8981119394302368, "rewards/symbolic_reward_partial_score/std": 0.2708629369735718, "rewards/tag_count_reward/mean": -0.076171875, "rewards/tag_count_reward/std": 0.26553234457969666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.108184576034546, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 282.0, "sampling/sampling_logp_difference/mean": 2.00418758392334, "step": 1201 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.32552970945835114, "epoch": 3.163157894736842, "grad_norm": 6.941614627838135, "learning_rate": 1e-06, "loss": 0.1422, "step": 1202 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.32488586008548737, "epoch": 3.1657894736842107, "grad_norm": 0.01468179002404213, "learning_rate": 1e-06, "loss": 0.1705, "step": 1203 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.30805665254592896, "epoch": 3.168421052631579, "grad_norm": 0.013095368631184101, "learning_rate": 1e-06, "loss": 0.2168, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9420.0, "completions/mean_length": 1713.927734375, "completions/mean_terminated_length": 571.206298828125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.3258505165576935, "epoch": 3.1710526315789473, "frac_reward_zero_std": 0.34375, "grad_norm": 112.94711303710938, "learning_rate": 1e-06, "loss": 0.1194, "num_tokens": 388982517.0, "reward": 0.8024517297744751, "reward_std": 0.18236470222473145, "rewards/progression_diversity/mean": -0.023384764790534973, "rewards/progression_diversity/std": 0.08616573363542557, "rewards/symbolic_reward_accuracy/mean": 0.888671875, "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, "rewards/symbolic_reward_partial_score/mean": 0.9152017831802368, "rewards/symbolic_reward_partial_score/std": 0.25712403655052185, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1038358211517334, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 284.0, "sampling/sampling_logp_difference/mean": 1.693922996520996, "step": 1205 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3253411054611206, "epoch": 3.1736842105263157, "grad_norm": 4.749903202056885, "learning_rate": 1e-06, "loss": 0.229, "step": 1206 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.32736538350582123, "epoch": 3.1763157894736844, "grad_norm": 0.008272160775959492, "learning_rate": 1e-06, "loss": 0.1404, "step": 1207 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.32286109030246735, "epoch": 3.1789473684210527, "grad_norm": 0.004888339899480343, "learning_rate": 1e-06, "loss": 0.1054, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15292.0, "completions/mean_length": 1806.24609375, "completions/mean_terminated_length": 570.8432006835938, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.3235931098461151, "epoch": 3.181578947368421, "frac_reward_zero_std": 0.40625, "grad_norm": 25.39488983154297, "learning_rate": 1e-06, "loss": 0.1452, "num_tokens": 390291091.0, "reward": 0.8004156351089478, "reward_std": 0.1887449473142624, "rewards/progression_diversity/mean": -0.026793645694851875, "rewards/progression_diversity/std": 0.09325826168060303, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9176431894302368, "rewards/symbolic_reward_partial_score/std": 0.25725844502449036, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1049944162368774, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 288.0, "sampling/sampling_logp_difference/mean": 1.934594750404358, "step": 1209 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3248671740293503, "epoch": 3.1842105263157894, "grad_norm": 0.007971227169036865, "learning_rate": 1e-06, "loss": 0.1074, "step": 1210 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.311784565448761, "epoch": 3.1868421052631577, "grad_norm": 15.894944190979004, "learning_rate": 1e-06, "loss": 0.188, "step": 1211 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3261888176202774, "epoch": 3.1894736842105265, "grad_norm": 0.019114594906568527, "learning_rate": 1e-06, "loss": 0.1449, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9429.0, "completions/mean_length": 1767.72265625, "completions/mean_terminated_length": 629.1915283203125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.3183141201734543, "epoch": 3.192105263157895, "frac_reward_zero_std": 0.3125, "grad_norm": 201.4045867919922, "learning_rate": 1e-06, "loss": 0.1038, "num_tokens": 391601637.0, "reward": 0.793647050857544, "reward_std": 0.20239751040935516, "rewards/progression_diversity/mean": -0.02494898810982704, "rewards/progression_diversity/std": 0.08966317027807236, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.9158528447151184, "rewards/symbolic_reward_partial_score/std": 0.2540964186191559, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1079578399658203, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 290.0, "sampling/sampling_logp_difference/mean": 2.028384208679199, "step": 1213 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.31385740637779236, "epoch": 3.194736842105263, "grad_norm": 0.025811778381466866, "learning_rate": 1e-06, "loss": 0.1903, "step": 1214 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3017711192369461, "epoch": 3.1973684210526314, "grad_norm": 148.46176147460938, "learning_rate": 1e-06, "loss": 0.243, "step": 1215 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3152967840433121, "epoch": 3.2, "grad_norm": 6.499451160430908, "learning_rate": 1e-06, "loss": 0.0559, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12806.0, "completions/mean_length": 2251.021484375, "completions/mean_terminated_length": 653.3804321289062, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.30623379349708557, "epoch": 3.2026315789473685, "frac_reward_zero_std": 0.28125, "grad_norm": 91.87493133544922, "learning_rate": 1e-06, "loss": 0.1956, "num_tokens": 393178320.0, "reward": 0.7417978048324585, "reward_std": 0.22114922106266022, "rewards/progression_diversity/mean": -0.029206208884716034, "rewards/progression_diversity/std": 0.09235595166683197, "rewards/symbolic_reward_accuracy/mean": 0.81640625, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.8688150644302368, "rewards/symbolic_reward_partial_score/std": 0.30946817994117737, "rewards/tag_count_reward/mean": -0.083984375, "rewards/tag_count_reward/std": 0.2776356339454651, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1069762706756592, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 292.0, "sampling/sampling_logp_difference/mean": 1.9247076511383057, "step": 1217 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.30731241405010223, "epoch": 3.205263157894737, "grad_norm": 6.715027809143066, "learning_rate": 1e-06, "loss": 0.1788, "step": 1218 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2882115989923477, "epoch": 3.207894736842105, "grad_norm": 0.01562909595668316, "learning_rate": 1e-06, "loss": 0.2044, "step": 1219 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.30942070484161377, "epoch": 3.2105263157894735, "grad_norm": 0.34386348724365234, "learning_rate": 1e-06, "loss": 0.1043, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 2317.568359375, "completions/mean_terminated_length": 555.4000244140625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.3095013201236725, "epoch": 3.213157894736842, "frac_reward_zero_std": 0.15625, "grad_norm": 89.25138092041016, "learning_rate": 1e-06, "loss": 0.134, "num_tokens": 394758899.0, "reward": 0.7222451567649841, "reward_std": 0.2374163269996643, "rewards/progression_diversity/mean": -0.03622689098119736, "rewards/progression_diversity/std": 0.10711545497179031, "rewards/symbolic_reward_accuracy/mean": 0.7890625, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.86572265625, "rewards/symbolic_reward_partial_score/std": 0.30833473801612854, "rewards/tag_count_reward/mean": -0.10546875, "rewards/tag_count_reward/std": 0.3074568510055542, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.108048915863037, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 292.0, "sampling/sampling_logp_difference/mean": 2.0354158878326416, "step": 1221 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.30622752010822296, "epoch": 3.2157894736842105, "grad_norm": 0.01329483650624752, "learning_rate": 1e-06, "loss": 0.2269, "step": 1222 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2995607703924179, "epoch": 3.218421052631579, "grad_norm": 0.011663331650197506, "learning_rate": 1e-06, "loss": 0.1555, "step": 1223 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.29136380553245544, "epoch": 3.221052631578947, "grad_norm": 0.02316325716674328, "learning_rate": 1e-06, "loss": 0.2399, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14746.0, "completions/mean_length": 1491.333984375, "completions/mean_terminated_length": 629.7747802734375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.31069743633270264, "epoch": 3.223684210526316, "frac_reward_zero_std": 0.3125, "grad_norm": 37.16641616821289, "learning_rate": 1e-06, "loss": 0.1773, "num_tokens": 395935454.0, "reward": 0.8113774061203003, "reward_std": 0.20781219005584717, "rewards/progression_diversity/mean": -0.019492559134960175, "rewards/progression_diversity/std": 0.07982630282640457, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.9226887822151184, "rewards/symbolic_reward_partial_score/std": 0.2527928054332733, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.096280813217163, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 292.0, "sampling/sampling_logp_difference/mean": 1.8208186626434326, "step": 1225 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.30570439994335175, "epoch": 3.2263157894736842, "grad_norm": 0.007866773754358292, "learning_rate": 1e-06, "loss": 0.215, "step": 1226 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.32926367223262787, "epoch": 3.2289473684210526, "grad_norm": 0.04123647138476372, "learning_rate": 1e-06, "loss": 0.0619, "step": 1227 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.32413050532341003, "epoch": 3.231578947368421, "grad_norm": 0.04225276783108711, "learning_rate": 1e-06, "loss": 0.1256, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10278.0, "completions/mean_length": 1581.517578125, "completions/mean_terminated_length": 594.6854248046875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.3033265024423599, "epoch": 3.2342105263157896, "frac_reward_zero_std": 0.34375, "grad_norm": 168.14776611328125, "learning_rate": 1e-06, "loss": 0.2286, "num_tokens": 397158023.0, "reward": 0.8106723427772522, "reward_std": 0.19477233290672302, "rewards/progression_diversity/mean": -0.021638944745063782, "rewards/progression_diversity/std": 0.08424000442028046, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.9230143427848816, "rewards/symbolic_reward_partial_score/std": 0.2554587125778198, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0959186553955078, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 292.0, "sampling/sampling_logp_difference/mean": 1.717806100845337, "step": 1229 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.32844409346580505, "epoch": 3.236842105263158, "grad_norm": 10.585503578186035, "learning_rate": 1e-06, "loss": 0.1134, "step": 1230 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31832896173000336, "epoch": 3.2394736842105263, "grad_norm": 1.0383377075195312, "learning_rate": 1e-06, "loss": 0.1613, "step": 1231 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.33217278122901917, "epoch": 3.2421052631578946, "grad_norm": 0.013041791506111622, "learning_rate": 1e-06, "loss": 0.0293, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13950.0, "completions/mean_length": 1890.353515625, "completions/mean_terminated_length": 561.51171875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.30604319274425507, "epoch": 3.2447368421052634, "frac_reward_zero_std": 0.25, "grad_norm": 29.466703414916992, "learning_rate": 1e-06, "loss": 0.1197, "num_tokens": 398532860.0, "reward": 0.7769619226455688, "reward_std": 0.22913838922977448, "rewards/progression_diversity/mean": -0.028422629460692406, "rewards/progression_diversity/std": 0.09470929205417633, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.9046223759651184, "rewards/symbolic_reward_partial_score/std": 0.27007627487182617, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.101804256439209, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 294.0, "sampling/sampling_logp_difference/mean": 1.9203028678894043, "step": 1233 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.31168776750564575, "epoch": 3.2473684210526317, "grad_norm": 59.52106857299805, "learning_rate": 1e-06, "loss": 0.2153, "step": 1234 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3222711682319641, "epoch": 3.25, "grad_norm": 0.33637794852256775, "learning_rate": 1e-06, "loss": 0.0839, "step": 1235 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.30836866796016693, "epoch": 3.2526315789473683, "grad_norm": 0.019555335864424706, "learning_rate": 1e-06, "loss": 0.1737, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.169921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 12113.0, "completions/mean_length": 3351.146484375, "completions/mean_terminated_length": 683.2446899414062, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.2953759431838989, "epoch": 3.2552631578947366, "frac_reward_zero_std": 0.21875, "grad_norm": 87.26698303222656, "learning_rate": 1e-06, "loss": 0.1332, "num_tokens": 400674471.0, "reward": 0.6438003182411194, "reward_std": 0.23994354903697968, "rewards/progression_diversity/mean": -0.05356261506676674, "rewards/progression_diversity/std": 0.12248878180980682, "rewards/symbolic_reward_accuracy/mean": 0.703125, "rewards/symbolic_reward_accuracy/std": 0.45732781291007996, "rewards/symbolic_reward_partial_score/mean": 0.7936197519302368, "rewards/symbolic_reward_partial_score/std": 0.36749520897865295, "rewards/tag_count_reward/mean": -0.15625, "rewards/tag_count_reward/std": 0.36344730854034424, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1080725193023682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 294.0, "sampling/sampling_logp_difference/mean": 2.1226110458374023, "step": 1237 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.30956192314624786, "epoch": 3.2578947368421054, "grad_norm": 9.49582290649414, "learning_rate": 1e-06, "loss": 0.1708, "step": 1238 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.27571606636047363, "epoch": 3.2605263157894737, "grad_norm": 0.008884378708899021, "learning_rate": 1e-06, "loss": 0.3496, "step": 1239 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.27126601338386536, "epoch": 3.263157894736842, "grad_norm": 18.252225875854492, "learning_rate": 1e-06, "loss": 0.2307, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 1877.640625, "completions/mean_terminated_length": 513.794921875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.32085326313972473, "epoch": 3.2657894736842104, "frac_reward_zero_std": 0.28125, "grad_norm": 87.52373504638672, "learning_rate": 1e-06, "loss": 0.0879, "num_tokens": 402027599.0, "reward": 0.7780733108520508, "reward_std": 0.20871524512767792, "rewards/progression_diversity/mean": -0.029581986367702484, "rewards/progression_diversity/std": 0.09899447858333588, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.90185546875, "rewards/symbolic_reward_partial_score/std": 0.2770686745643616, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0994126796722412, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 294.0, "sampling/sampling_logp_difference/mean": 2.019242286682129, "step": 1241 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.30657218396663666, "epoch": 3.268421052631579, "grad_norm": 0.007070607040077448, "learning_rate": 1e-06, "loss": 0.1977, "step": 1242 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.30736126005649567, "epoch": 3.2710526315789474, "grad_norm": 0.016214022412896156, "learning_rate": 1e-06, "loss": 0.2739, "step": 1243 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.32670439779758453, "epoch": 3.2736842105263158, "grad_norm": 0.010165052488446236, "learning_rate": 1e-06, "loss": 0.1227, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12475.0, "completions/mean_length": 2460.5546875, "completions/mean_terminated_length": 577.3392333984375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.29750363528728485, "epoch": 3.276315789473684, "frac_reward_zero_std": 0.28125, "grad_norm": 246.82859802246094, "learning_rate": 1e-06, "loss": 0.2205, "num_tokens": 403684715.0, "reward": 0.7260443568229675, "reward_std": 0.22644254565238953, "rewards/progression_diversity/mean": -0.03717077150940895, "rewards/progression_diversity/std": 0.10484474152326584, "rewards/symbolic_reward_accuracy/mean": 0.798828125, "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, "rewards/symbolic_reward_partial_score/mean": 0.8575845956802368, "rewards/symbolic_reward_partial_score/std": 0.3189500868320465, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1037794351577759, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 296.0, "sampling/sampling_logp_difference/mean": 2.2273709774017334, "step": 1245 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.31314218044281006, "epoch": 3.2789473684210524, "grad_norm": 4.193942070007324, "learning_rate": 1e-06, "loss": 0.1196, "step": 1246 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.309044748544693, "epoch": 3.281578947368421, "grad_norm": 0.0659249871969223, "learning_rate": 1e-06, "loss": 0.1756, "step": 1247 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.285678505897522, "epoch": 3.2842105263157895, "grad_norm": 0.006575907580554485, "learning_rate": 1e-06, "loss": 0.2906, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 2114.935546875, "completions/mean_terminated_length": 536.3622436523438, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.30202820897102356, "epoch": 3.286842105263158, "frac_reward_zero_std": 0.25, "grad_norm": 161.16702270507812, "learning_rate": 1e-06, "loss": 0.2409, "num_tokens": 405162026.0, "reward": 0.7425325512886047, "reward_std": 0.2102564126253128, "rewards/progression_diversity/mean": -0.028976155444979668, "rewards/progression_diversity/std": 0.09275542944669724, "rewards/symbolic_reward_accuracy/mean": 0.814453125, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.87451171875, "rewards/symbolic_reward_partial_score/std": 0.3029172122478485, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1059821844100952, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 296.0, "sampling/sampling_logp_difference/mean": 2.2559051513671875, "step": 1249 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.32025621831417084, "epoch": 3.2894736842105265, "grad_norm": 0.03536779060959816, "learning_rate": 1e-06, "loss": 0.1425, "step": 1250 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.29748618602752686, "epoch": 3.292105263157895, "grad_norm": 1.4454317092895508, "learning_rate": 1e-06, "loss": 0.2194, "step": 1251 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3038511127233505, "epoch": 3.294736842105263, "grad_norm": 0.013857961632311344, "learning_rate": 1e-06, "loss": 0.1161, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 10572.0, "completions/mean_length": 2446.869140625, "completions/mean_terminated_length": 596.8074951171875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.3028118759393692, "epoch": 3.2973684210526315, "frac_reward_zero_std": 0.125, "grad_norm": 439.4346008300781, "learning_rate": 1e-06, "loss": 0.2289, "num_tokens": 406818631.0, "reward": 0.7258546948432922, "reward_std": 0.24171996116638184, "rewards/progression_diversity/mean": -0.04148336872458458, "rewards/progression_diversity/std": 0.1141848936676979, "rewards/symbolic_reward_accuracy/mean": 0.80078125, "rewards/symbolic_reward_accuracy/std": 0.39980348944664, "rewards/symbolic_reward_partial_score/mean": 0.8564453125, "rewards/symbolic_reward_partial_score/std": 0.33105501532554626, "rewards/tag_count_reward/mean": -0.111328125, "rewards/tag_count_reward/std": 0.31484565138816833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1173810958862305, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 296.0, "sampling/sampling_logp_difference/mean": 3.1980838775634766, "step": 1253 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.3106544762849808, "epoch": 3.3, "grad_norm": 0.009494182653725147, "learning_rate": 1e-06, "loss": 0.1479, "step": 1254 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.2966717481613159, "epoch": 3.3026315789473686, "grad_norm": 3.8081188201904297, "learning_rate": 1e-06, "loss": 0.2924, "step": 1255 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.29619279503822327, "epoch": 3.305263157894737, "grad_norm": 0.04518857225775719, "learning_rate": 1e-06, "loss": 0.31, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13429.0, "completions/mean_length": 2205.7890625, "completions/mean_terminated_length": 534.1222534179688, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.3126777410507202, "epoch": 3.307894736842105, "frac_reward_zero_std": 0.15625, "grad_norm": 185.87486267089844, "learning_rate": 1e-06, "loss": 0.1568, "num_tokens": 408320763.0, "reward": 0.7695430517196655, "reward_std": 0.2663387060165405, "rewards/progression_diversity/mean": -0.03788202255964279, "rewards/progression_diversity/std": 0.1108899936079979, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.8873697519302368, "rewards/symbolic_reward_partial_score/std": 0.2990962564945221, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1089106798171997, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 296.0, "sampling/sampling_logp_difference/mean": 2.924543857574463, "step": 1257 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2874736040830612, "epoch": 3.3105263157894735, "grad_norm": 0.3372192084789276, "learning_rate": 1e-06, "loss": 0.3459, "step": 1258 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3138102889060974, "epoch": 3.3131578947368423, "grad_norm": 0.008773678913712502, "learning_rate": 1e-06, "loss": 0.116, "step": 1259 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.28731557726860046, "epoch": 3.3157894736842106, "grad_norm": 0.016135241836309433, "learning_rate": 1e-06, "loss": 0.2797, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8639.0, "completions/mean_length": 2694.34375, "completions/mean_terminated_length": 597.729736328125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.2984755337238312, "epoch": 3.318421052631579, "frac_reward_zero_std": 0.34375, "grad_norm": 48.49712371826172, "learning_rate": 1e-06, "loss": 0.1345, "num_tokens": 410128779.0, "reward": 0.7372407913208008, "reward_std": 0.21289455890655518, "rewards/progression_diversity/mean": -0.04057057946920395, "rewards/progression_diversity/std": 0.10670918226242065, "rewards/symbolic_reward_accuracy/mean": 0.8203125, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.85986328125, "rewards/symbolic_reward_partial_score/std": 0.32551804184913635, "rewards/tag_count_reward/mean": -0.125, "rewards/tag_count_reward/std": 0.3310423493385315, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.096588373184204, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 296.0, "sampling/sampling_logp_difference/mean": 2.3006858825683594, "step": 1261 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2700554430484772, "epoch": 3.3210526315789473, "grad_norm": 0.009636012837290764, "learning_rate": 1e-06, "loss": 0.3738, "step": 1262 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.30288518965244293, "epoch": 3.3236842105263156, "grad_norm": 0.275381863117218, "learning_rate": 1e-06, "loss": 0.1203, "step": 1263 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3005267381668091, "epoch": 3.3263157894736843, "grad_norm": 0.15126138925552368, "learning_rate": 1e-06, "loss": 0.1354, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15602.0, "completions/mean_length": 2717.74609375, "completions/mean_terminated_length": 624.7162475585938, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.29306742548942566, "epoch": 3.3289473684210527, "frac_reward_zero_std": 0.125, "grad_norm": 84.46516418457031, "learning_rate": 1e-06, "loss": 0.2436, "num_tokens": 411934761.0, "reward": 0.6926791071891785, "reward_std": 0.26792874932289124, "rewards/progression_diversity/mean": -0.03872973471879959, "rewards/progression_diversity/std": 0.10431662201881409, "rewards/symbolic_reward_accuracy/mean": 0.755859375, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.8388671875, "rewards/symbolic_reward_partial_score/std": 0.327902227640152, "rewards/tag_count_reward/mean": -0.12109375, "rewards/tag_count_reward/std": 0.3265552520751953, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.09934663772583, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 298.0, "sampling/sampling_logp_difference/mean": 3.240568161010742, "step": 1265 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.28837116062641144, "epoch": 3.331578947368421, "grad_norm": 42.86471939086914, "learning_rate": 1e-06, "loss": 0.2322, "step": 1266 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.3078329712152481, "epoch": 3.3342105263157893, "grad_norm": 0.009662003256380558, "learning_rate": 1e-06, "loss": 0.1655, "step": 1267 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.2638581395149231, "epoch": 3.336842105263158, "grad_norm": 0.007266658823937178, "learning_rate": 1e-06, "loss": 0.2735, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 2193.158203125, "completions/mean_terminated_length": 554.5642700195312, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.3069176971912384, "epoch": 3.3394736842105264, "frac_reward_zero_std": 0.28125, "grad_norm": 140.47747802734375, "learning_rate": 1e-06, "loss": 0.1502, "num_tokens": 413436602.0, "reward": 0.7662738561630249, "reward_std": 0.20352765917778015, "rewards/progression_diversity/mean": -0.032773204147815704, "rewards/progression_diversity/std": 0.09917779266834259, "rewards/symbolic_reward_accuracy/mean": 0.84375, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.9016926884651184, "rewards/symbolic_reward_partial_score/std": 0.27043241262435913, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.093727946281433, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 298.0, "sampling/sampling_logp_difference/mean": 3.0766854286193848, "step": 1269 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.32548436522483826, "epoch": 3.3421052631578947, "grad_norm": 0.011535858735442162, "learning_rate": 1e-06, "loss": 0.1011, "step": 1270 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2972973585128784, "epoch": 3.344736842105263, "grad_norm": 0.00575890950858593, "learning_rate": 1e-06, "loss": 0.2421, "step": 1271 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2989150583744049, "epoch": 3.3473684210526318, "grad_norm": 0.008553121238946915, "learning_rate": 1e-06, "loss": 0.2467, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.087890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1971.931640625, "completions/mean_terminated_length": 583.1884155273438, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.31215590238571167, "epoch": 3.35, "frac_reward_zero_std": 0.21875, "grad_norm": 132.39788818359375, "learning_rate": 1e-06, "loss": 0.1477, "num_tokens": 414865399.0, "reward": 0.7922816276550293, "reward_std": 0.2368476241827011, "rewards/progression_diversity/mean": -0.029654610902071, "rewards/progression_diversity/std": 0.09719457477331161, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.904296875, "rewards/symbolic_reward_partial_score/std": 0.28047847747802734, "rewards/tag_count_reward/mean": -0.072265625, "rewards/tag_count_reward/std": 0.2591804563999176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0867669582366943, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 300.0, "sampling/sampling_logp_difference/mean": 3.5754165649414062, "step": 1273 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3091353476047516, "epoch": 3.3526315789473684, "grad_norm": 0.010651475749909878, "learning_rate": 1e-06, "loss": 0.1685, "step": 1274 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.30523787438869476, "epoch": 3.3552631578947367, "grad_norm": 0.008251101709902287, "learning_rate": 1e-06, "loss": 0.2036, "step": 1275 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.29036298394203186, "epoch": 3.3578947368421055, "grad_norm": 0.012122713960707188, "learning_rate": 1e-06, "loss": 0.2905, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 1901.666015625, "completions/mean_terminated_length": 573.8614501953125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.30242083966732025, "epoch": 3.360526315789474, "frac_reward_zero_std": 0.25, "grad_norm": 185.1011199951172, "learning_rate": 1e-06, "loss": 0.2087, "num_tokens": 416246188.0, "reward": 0.7942166328430176, "reward_std": 0.22256335616111755, "rewards/progression_diversity/mean": -0.026581626385450363, "rewards/progression_diversity/std": 0.0894814282655716, "rewards/symbolic_reward_accuracy/mean": 0.87890625, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.9178059697151184, "rewards/symbolic_reward_partial_score/std": 0.25296953320503235, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0797239542007446, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 300.0, "sampling/sampling_logp_difference/mean": 3.349595069885254, "step": 1277 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.30199746787548065, "epoch": 3.363157894736842, "grad_norm": 0.02538270130753517, "learning_rate": 1e-06, "loss": 0.2414, "step": 1278 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3231187015771866, "epoch": 3.3657894736842104, "grad_norm": 0.012012549676001072, "learning_rate": 1e-06, "loss": 0.1036, "step": 1279 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3105398118495941, "epoch": 3.3684210526315788, "grad_norm": 0.013158765621483326, "learning_rate": 1e-06, "loss": 0.1455, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 2354.533203125, "completions/mean_terminated_length": 596.99560546875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.30621521174907684, "epoch": 3.3710526315789475, "frac_reward_zero_std": 0.25, "grad_norm": 140.82589721679688, "learning_rate": 1e-06, "loss": 0.1302, "num_tokens": 417859869.0, "reward": 0.7398780584335327, "reward_std": 0.2521520256996155, "rewards/progression_diversity/mean": -0.0356326699256897, "rewards/progression_diversity/std": 0.10271129757165909, "rewards/symbolic_reward_accuracy/mean": 0.81640625, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.8697916269302368, "rewards/symbolic_reward_partial_score/std": 0.3091786503791809, "rewards/tag_count_reward/mean": -0.10546875, "rewards/tag_count_reward/std": 0.3074568510055542, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0742948055267334, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 302.0, "sampling/sampling_logp_difference/mean": 3.784925699234009, "step": 1281 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.29408055543899536, "epoch": 3.373684210526316, "grad_norm": 0.010642572306096554, "learning_rate": 1e-06, "loss": 0.1868, "step": 1282 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.29353274405002594, "epoch": 3.376315789473684, "grad_norm": 0.01788030005991459, "learning_rate": 1e-06, "loss": 0.1809, "step": 1283 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2797112911939621, "epoch": 3.3789473684210525, "grad_norm": 0.006915436126291752, "learning_rate": 1e-06, "loss": 0.2527, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16124.0, "completions/mean_length": 1972.298828125, "completions/mean_terminated_length": 617.3526000976562, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.33247339725494385, "epoch": 3.3815789473684212, "frac_reward_zero_std": 0.25, "grad_norm": 348.1959228515625, "learning_rate": 1e-06, "loss": 0.0597, "num_tokens": 419281174.0, "reward": 0.793836772441864, "reward_std": 0.20707328617572784, "rewards/progression_diversity/mean": -0.030389215797185898, "rewards/progression_diversity/std": 0.09979861974716187, "rewards/symbolic_reward_accuracy/mean": 0.87890625, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.9153646230697632, "rewards/symbolic_reward_partial_score/std": 0.25428155064582825, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062885046005249, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 304.0, "sampling/sampling_logp_difference/mean": 3.6513113975524902, "step": 1285 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3000902533531189, "epoch": 3.3842105263157896, "grad_norm": 0.006365468725562096, "learning_rate": 1e-06, "loss": 0.1583, "step": 1286 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2907492071390152, "epoch": 3.386842105263158, "grad_norm": 0.013992332853376865, "learning_rate": 1e-06, "loss": 0.2211, "step": 1287 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2959248870611191, "epoch": 3.389473684210526, "grad_norm": 0.006391443312168121, "learning_rate": 1e-06, "loss": 0.2083, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16384.0, "completions/max_terminated_length": 9612.0, "completions/mean_length": 2112.052734375, "completions/mean_terminated_length": 635.6444091796875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.3044814467430115, "epoch": 3.3921052631578945, "frac_reward_zero_std": 0.15625, "grad_norm": 314.1883239746094, "learning_rate": 1e-06, "loss": 0.1179, "num_tokens": 420783377.0, "reward": 0.77393639087677, "reward_std": 0.25700780749320984, "rewards/progression_diversity/mean": -0.03312736004590988, "rewards/progression_diversity/std": 0.10419656336307526, "rewards/symbolic_reward_accuracy/mean": 0.857421875, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.8946939706802368, "rewards/symbolic_reward_partial_score/std": 0.2826669216156006, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0506939888000488, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 304.0, "sampling/sampling_logp_difference/mean": 5.086845397949219, "step": 1289 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.27778656780719757, "epoch": 3.3947368421052633, "grad_norm": 0.013014139607548714, "learning_rate": 1e-06, "loss": 0.2738, "step": 1290 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2996399998664856, "epoch": 3.3973684210526316, "grad_norm": 0.06032712757587433, "learning_rate": 1e-06, "loss": 0.2089, "step": 1291 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.30588841438293457, "epoch": 3.4, "grad_norm": 0.018283555284142494, "learning_rate": 1e-06, "loss": 0.1612, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10146.0, "completions/mean_length": 2171.923828125, "completions/mean_terminated_length": 565.34130859375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.3020727336406708, "epoch": 3.4026315789473682, "frac_reward_zero_std": 0.25, "grad_norm": 125.83609008789062, "learning_rate": 1e-06, "loss": 0.0761, "num_tokens": 422257322.0, "reward": 0.7886345386505127, "reward_std": 0.19221815466880798, "rewards/progression_diversity/mean": -0.03791151940822601, "rewards/progression_diversity/std": 0.11339190602302551, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.9158528447151184, "rewards/symbolic_reward_partial_score/std": 0.25923240184783936, "rewards/tag_count_reward/mean": -0.083984375, "rewards/tag_count_reward/std": 0.2776356339454651, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0494663715362549, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 304.0, "sampling/sampling_logp_difference/mean": 4.9170942306518555, "step": 1293 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.29434891045093536, "epoch": 3.405263157894737, "grad_norm": 0.01162765920162201, "learning_rate": 1e-06, "loss": 0.1834, "step": 1294 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2950708270072937, "epoch": 3.4078947368421053, "grad_norm": 0.009900042787194252, "learning_rate": 1e-06, "loss": 0.2169, "step": 1295 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.30467459559440613, "epoch": 3.4105263157894736, "grad_norm": 0.009985024109482765, "learning_rate": 1e-06, "loss": 0.1849, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15928.0, "completions/mean_length": 3336.40625, "completions/mean_terminated_length": 628.4151000976562, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.2799158841371536, "epoch": 3.413157894736842, "frac_reward_zero_std": 0.125, "grad_norm": 283.4808654785156, "learning_rate": 1e-06, "loss": 0.166, "num_tokens": 424360346.0, "reward": 0.7545630931854248, "reward_std": 0.22717466950416565, "rewards/progression_diversity/mean": -0.06615223735570908, "rewards/progression_diversity/std": 0.14478321373462677, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.87744140625, "rewards/symbolic_reward_partial_score/std": 0.3093447685241699, "rewards/tag_count_reward/mean": -0.119140625, "rewards/tag_count_reward/std": 0.32427072525024414, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0230858325958252, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 304.0, "sampling/sampling_logp_difference/mean": 7.494105815887451, "step": 1297 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.23919285833835602, "epoch": 3.4157894736842107, "grad_norm": 1.23762845993042, "learning_rate": 1e-06, "loss": 0.371, "step": 1298 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.25987882912158966, "epoch": 3.418421052631579, "grad_norm": 0.5271911025047302, "learning_rate": 1e-06, "loss": 0.2374, "step": 1299 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.26133809983730316, "epoch": 3.4210526315789473, "grad_norm": 0.11925064027309418, "learning_rate": 1e-06, "loss": 0.188, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10431.0, "completions/mean_length": 3059.662109375, "completions/mean_terminated_length": 592.192138671875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.2862105369567871, "epoch": 3.4236842105263157, "frac_reward_zero_std": 0.09375, "grad_norm": 341.6363830566406, "learning_rate": 1e-06, "loss": 0.195, "num_tokens": 426325869.0, "reward": 0.7616207599639893, "reward_std": 0.24166487157344818, "rewards/progression_diversity/mean": -0.05862627178430557, "rewards/progression_diversity/std": 0.13662515580654144, "rewards/symbolic_reward_accuracy/mean": 0.845703125, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.8831380009651184, "rewards/symbolic_reward_partial_score/std": 0.301278293132782, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0274146795272827, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 304.0, "sampling/sampling_logp_difference/mean": 7.100645542144775, "step": 1301 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.27012303471565247, "epoch": 3.4263157894736844, "grad_norm": 3.0215957164764404, "learning_rate": 1e-06, "loss": 0.2341, "step": 1302 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.26916660368442535, "epoch": 3.4289473684210527, "grad_norm": 37.49262619018555, "learning_rate": 1e-06, "loss": 0.1515, "step": 1303 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.27728109061717987, "epoch": 3.431578947368421, "grad_norm": 6.821070194244385, "learning_rate": 1e-06, "loss": 0.2638, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15417.0, "completions/mean_length": 3132.240234375, "completions/mean_terminated_length": 678.2106323242188, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.2735847979784012, "epoch": 3.4342105263157894, "frac_reward_zero_std": 0.125, "grad_norm": 166.9344940185547, "learning_rate": 1e-06, "loss": 0.2523, "num_tokens": 428342568.0, "reward": 0.7301239967346191, "reward_std": 0.27412816882133484, "rewards/progression_diversity/mean": -0.05889313295483589, "rewards/progression_diversity/std": 0.136581152677536, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.8640950918197632, "rewards/symbolic_reward_partial_score/std": 0.31252816319465637, "rewards/tag_count_reward/mean": -0.125, "rewards/tag_count_reward/std": 0.3310423493385315, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0343992710113525, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 306.0, "sampling/sampling_logp_difference/mean": 6.35936164855957, "step": 1305 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.264299601316452, "epoch": 3.4368421052631577, "grad_norm": 14.73751449584961, "learning_rate": 1e-06, "loss": 0.1532, "step": 1306 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.27560053765773773, "epoch": 3.4394736842105265, "grad_norm": 2.936858892440796, "learning_rate": 1e-06, "loss": 0.2459, "step": 1307 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2791202515363693, "epoch": 3.442105263157895, "grad_norm": 0.5196275115013123, "learning_rate": 1e-06, "loss": 0.2228, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 2650.87890625, "completions/mean_terminated_length": 583.1954956054688, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.30482953786849976, "epoch": 3.444736842105263, "frac_reward_zero_std": 0.03125, "grad_norm": 386.9339294433594, "learning_rate": 1e-06, "loss": 0.1641, "num_tokens": 430092778.0, "reward": 0.7358328104019165, "reward_std": 0.2448652684688568, "rewards/progression_diversity/mean": -0.04465062916278839, "rewards/progression_diversity/std": 0.11796204745769501, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.87548828125, "rewards/symbolic_reward_partial_score/std": 0.299261212348938, "rewards/tag_count_reward/mean": -0.103515625, "rewards/tag_count_reward/std": 0.30492907762527466, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0414533615112305, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 304.0, "sampling/sampling_logp_difference/mean": 5.4577789306640625, "step": 1309 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.28392690420150757, "epoch": 3.4473684210526314, "grad_norm": 160.2227020263672, "learning_rate": 1e-06, "loss": 0.2389, "step": 1310 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3085232526063919, "epoch": 3.45, "grad_norm": 0.17238354682922363, "learning_rate": 1e-06, "loss": 0.1332, "step": 1311 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3984375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.2967042475938797, "epoch": 3.4526315789473685, "grad_norm": 7.176582336425781, "learning_rate": 1e-06, "loss": 0.1358, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9963.0, "completions/mean_length": 2258.275390625, "completions/mean_terminated_length": 627.200439453125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.31084753572940826, "epoch": 3.455263157894737, "frac_reward_zero_std": 0.25, "grad_norm": 914.4140625, "learning_rate": 1e-06, "loss": 0.1569, "num_tokens": 431662167.0, "reward": 0.7634491920471191, "reward_std": 0.23263157904148102, "rewards/progression_diversity/mean": -0.03691425174474716, "rewards/progression_diversity/std": 0.10997199267148972, "rewards/symbolic_reward_accuracy/mean": 0.841796875, "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, "rewards/symbolic_reward_partial_score/mean": 0.8917642831802368, "rewards/symbolic_reward_partial_score/std": 0.28572434186935425, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0521239042282104, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 4.490725994110107, "step": 1313 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3135823905467987, "epoch": 3.457894736842105, "grad_norm": 48.58332824707031, "learning_rate": 1e-06, "loss": 0.1558, "step": 1314 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.32143357396125793, "epoch": 3.4605263157894735, "grad_norm": 0.030546952039003372, "learning_rate": 1e-06, "loss": 0.1197, "step": 1315 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.29036542773246765, "epoch": 3.463157894736842, "grad_norm": 0.008445881307125092, "learning_rate": 1e-06, "loss": 0.2007, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2140.0, "completions/mean_length": 2460.689453125, "completions/mean_terminated_length": 612.46240234375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.29718485474586487, "epoch": 3.4657894736842105, "frac_reward_zero_std": 0.15625, "grad_norm": 533.7861938476562, "learning_rate": 1e-06, "loss": 0.1662, "num_tokens": 433313176.0, "reward": 0.7628533840179443, "reward_std": 0.2578679919242859, "rewards/progression_diversity/mean": -0.04279157519340515, "rewards/progression_diversity/std": 0.11989957839250565, "rewards/symbolic_reward_accuracy/mean": 0.84375, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.8880208730697632, "rewards/symbolic_reward_partial_score/std": 0.28819218277931213, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0420756340026855, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 5.4957427978515625, "step": 1317 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3101320266723633, "epoch": 3.468421052631579, "grad_norm": 0.01260958332568407, "learning_rate": 1e-06, "loss": 0.1788, "step": 1318 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.27350659668445587, "epoch": 3.4710526315789476, "grad_norm": 3.119270086288452, "learning_rate": 1e-06, "loss": 0.3215, "step": 1319 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.40625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.3225550502538681, "epoch": 3.473684210526316, "grad_norm": 2.4085659980773926, "learning_rate": 1e-06, "loss": 0.1297, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16143.0, "completions/mean_length": 2217.62109375, "completions/mean_terminated_length": 650.4078369140625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.3091781437397003, "epoch": 3.4763157894736842, "frac_reward_zero_std": 0.25, "grad_norm": 291.2024230957031, "learning_rate": 1e-06, "loss": 0.1649, "num_tokens": 434844726.0, "reward": 0.7853677272796631, "reward_std": 0.206242173910141, "rewards/progression_diversity/mean": -0.03744862973690033, "rewards/progression_diversity/std": 0.11320951581001282, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.9075521230697632, "rewards/symbolic_reward_partial_score/std": 0.2654723823070526, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.050972580909729, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 4.5054779052734375, "step": 1321 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.30050645768642426, "epoch": 3.4789473684210526, "grad_norm": 0.1077585518360138, "learning_rate": 1e-06, "loss": 0.2148, "step": 1322 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.3242476284503937, "epoch": 3.481578947368421, "grad_norm": 0.015654178336262703, "learning_rate": 1e-06, "loss": 0.1183, "step": 1323 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3109499663114548, "epoch": 3.4842105263157896, "grad_norm": 0.02455841936171055, "learning_rate": 1e-06, "loss": 0.1862, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.158203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8314.0, "completions/mean_length": 3150.29296875, "completions/mean_terminated_length": 663.2157592773438, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.27029645442962646, "epoch": 3.486842105263158, "frac_reward_zero_std": 0.125, "grad_norm": 303.6960144042969, "learning_rate": 1e-06, "loss": 0.1961, "num_tokens": 436880172.0, "reward": 0.7058703899383545, "reward_std": 0.2576483488082886, "rewards/progression_diversity/mean": -0.057497672736644745, "rewards/progression_diversity/std": 0.13345807790756226, "rewards/symbolic_reward_accuracy/mean": 0.76953125, "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, "rewards/symbolic_reward_partial_score/mean": 0.8619791269302368, "rewards/symbolic_reward_partial_score/std": 0.3079785406589508, "rewards/tag_count_reward/mean": -0.138671875, "rewards/tag_count_reward/std": 0.34594178199768066, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0368916988372803, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 5.3984527587890625, "step": 1325 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.2876896560192108, "epoch": 3.4894736842105263, "grad_norm": 0.016265718266367912, "learning_rate": 1e-06, "loss": 0.1761, "step": 1326 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.27050982415676117, "epoch": 3.4921052631578946, "grad_norm": 1.3987834453582764, "learning_rate": 1e-06, "loss": 0.269, "step": 1327 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.28464990854263306, "epoch": 3.4947368421052634, "grad_norm": 32.02823257446289, "learning_rate": 1e-06, "loss": 0.1709, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10308.0, "completions/mean_length": 1944.478515625, "completions/mean_terminated_length": 654.1383056640625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.2990902215242386, "epoch": 3.4973684210526317, "frac_reward_zero_std": 0.1875, "grad_norm": 411.1249694824219, "learning_rate": 1e-06, "loss": 0.2033, "num_tokens": 438276225.0, "reward": 0.7881462574005127, "reward_std": 0.22861242294311523, "rewards/progression_diversity/mean": -0.03303021937608719, "rewards/progression_diversity/std": 0.1093946024775505, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.9140625, "rewards/symbolic_reward_partial_score/std": 0.2569299340248108, "rewards/tag_count_reward/mean": -0.083984375, "rewards/tag_count_reward/std": 0.2776356339454651, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.038718819618225, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 5.05161190032959, "step": 1329 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3089918792247772, "epoch": 3.5, "grad_norm": 0.021115725859999657, "learning_rate": 1e-06, "loss": 0.1788, "step": 1330 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3167189657688141, "epoch": 3.5026315789473683, "grad_norm": 0.010390168987214565, "learning_rate": 1e-06, "loss": 0.1894, "step": 1331 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.33414115011692047, "epoch": 3.5052631578947366, "grad_norm": 0.01447315514087677, "learning_rate": 1e-06, "loss": 0.0922, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3667.0, "completions/mean_length": 2530.16015625, "completions/mean_terminated_length": 621.4088745117188, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.2866598814725876, "epoch": 3.5078947368421054, "frac_reward_zero_std": 0.15625, "grad_norm": 1008.520751953125, "learning_rate": 1e-06, "loss": 0.2519, "num_tokens": 439965971.0, "reward": 0.7736403942108154, "reward_std": 0.23456132411956787, "rewards/progression_diversity/mean": -0.0480697825551033, "rewards/progression_diversity/std": 0.13115154206752777, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.8948568105697632, "rewards/symbolic_reward_partial_score/std": 0.2902930974960327, "rewards/tag_count_reward/mean": -0.111328125, "rewards/tag_count_reward/std": 0.31484565138816833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0280325412750244, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 5.589191436767578, "step": 1333 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.30402611196041107, "epoch": 3.5105263157894737, "grad_norm": 1195.4456787109375, "learning_rate": 1e-06, "loss": 0.2695, "step": 1334 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.29913991689682007, "epoch": 3.513157894736842, "grad_norm": 0.3809571862220764, "learning_rate": 1e-06, "loss": 0.1482, "step": 1335 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.31326740980148315, "epoch": 3.515789473684211, "grad_norm": 1.5359933376312256, "learning_rate": 1e-06, "loss": 0.1986, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 10774.0, "completions/mean_length": 2040.724609375, "completions/mean_terminated_length": 624.8648071289062, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.3067469596862793, "epoch": 3.518421052631579, "frac_reward_zero_std": 0.34375, "grad_norm": 130.65467834472656, "learning_rate": 1e-06, "loss": 0.1706, "num_tokens": 441414950.0, "reward": 0.7661923170089722, "reward_std": 0.18958313763141632, "rewards/progression_diversity/mean": -0.03604454547166824, "rewards/progression_diversity/std": 0.11484090983867645, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.9021810293197632, "rewards/symbolic_reward_partial_score/std": 0.2684669494628906, "rewards/tag_count_reward/mean": -0.080078125, "rewards/tag_count_reward/std": 0.271679550409317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0413990020751953, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 4.274114608764648, "step": 1337 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3064485937356949, "epoch": 3.5210526315789474, "grad_norm": 4.137460708618164, "learning_rate": 1e-06, "loss": 0.1474, "step": 1338 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.30475564301013947, "epoch": 3.5236842105263158, "grad_norm": 0.25504910945892334, "learning_rate": 1e-06, "loss": 0.1556, "step": 1339 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3473477065563202, "epoch": 3.526315789473684, "grad_norm": 0.14421966671943665, "learning_rate": 1e-06, "loss": 0.076, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15636.0, "completions/mean_length": 2503.556640625, "completions/mean_terminated_length": 626.1574096679688, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.3233236074447632, "epoch": 3.5289473684210524, "frac_reward_zero_std": 0.21875, "grad_norm": 764.384765625, "learning_rate": 1e-06, "loss": 0.1338, "num_tokens": 443064707.0, "reward": 0.7563366889953613, "reward_std": 0.2102624773979187, "rewards/progression_diversity/mean": -0.04993094503879547, "rewards/progression_diversity/std": 0.1373254358768463, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.8860677480697632, "rewards/symbolic_reward_partial_score/std": 0.28898024559020996, "rewards/tag_count_reward/mean": -0.10546875, "rewards/tag_count_reward/std": 0.3074568510055542, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0196648836135864, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 6.12931489944458, "step": 1341 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2928446978330612, "epoch": 3.531578947368421, "grad_norm": 0.5019714832305908, "learning_rate": 1e-06, "loss": 0.224, "step": 1342 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3110320568084717, "epoch": 3.5342105263157895, "grad_norm": 0.13324137032032013, "learning_rate": 1e-06, "loss": 0.1591, "step": 1343 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.31039758026599884, "epoch": 3.536842105263158, "grad_norm": 0.014469148591160774, "learning_rate": 1e-06, "loss": 0.2039, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 2396.134765625, "completions/mean_terminated_length": 643.808837890625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.33016908168792725, "epoch": 3.5394736842105265, "frac_reward_zero_std": 0.25, "grad_norm": 333.5965576171875, "learning_rate": 1e-06, "loss": 0.108, "num_tokens": 444689480.0, "reward": 0.7707592844963074, "reward_std": 0.21156515181064606, "rewards/progression_diversity/mean": -0.04321339726448059, "rewards/progression_diversity/std": 0.1244661882519722, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.9091796875, "rewards/symbolic_reward_partial_score/std": 0.25746801495552063, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.030545949935913, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 5.220053672790527, "step": 1345 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3137245178222656, "epoch": 3.542105263157895, "grad_norm": 0.03173833340406418, "learning_rate": 1e-06, "loss": 0.1528, "step": 1346 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3061438798904419, "epoch": 3.544736842105263, "grad_norm": 0.012165974825620651, "learning_rate": 1e-06, "loss": 0.1486, "step": 1347 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3039957582950592, "epoch": 3.5473684210526315, "grad_norm": 0.023023858666419983, "learning_rate": 1e-06, "loss": 0.2288, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 9747.0, "completions/mean_length": 2442.896484375, "completions/mean_terminated_length": 627.1677856445312, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.31957219541072845, "epoch": 3.55, "frac_reward_zero_std": 0.21875, "grad_norm": 117.19609069824219, "learning_rate": 1e-06, "loss": 0.1076, "num_tokens": 446334707.0, "reward": 0.7665076851844788, "reward_std": 0.2067509889602661, "rewards/progression_diversity/mean": -0.04357065260410309, "rewards/progression_diversity/std": 0.12135568261146545, "rewards/symbolic_reward_accuracy/mean": 0.845703125, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.90087890625, "rewards/symbolic_reward_partial_score/std": 0.2657473385334015, "rewards/tag_count_reward/mean": -0.107421875, "rewards/tag_count_reward/std": 0.30995169281959534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0276845693588257, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 5.852321624755859, "step": 1349 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.30125513672828674, "epoch": 3.5526315789473686, "grad_norm": 0.008969382382929325, "learning_rate": 1e-06, "loss": 0.2238, "step": 1350 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.300917848944664, "epoch": 3.555263157894737, "grad_norm": 0.008844579569995403, "learning_rate": 1e-06, "loss": 0.1436, "step": 1351 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.28367167711257935, "epoch": 3.557894736842105, "grad_norm": 0.01489369384944439, "learning_rate": 1e-06, "loss": 0.2285, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.138671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15503.0, "completions/mean_length": 2874.439453125, "completions/mean_terminated_length": 699.4308471679688, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.29677602648735046, "epoch": 3.5605263157894735, "frac_reward_zero_std": 0.0625, "grad_norm": 187.5312957763672, "learning_rate": 1e-06, "loss": 0.1843, "num_tokens": 448214740.0, "reward": 0.7094203233718872, "reward_std": 0.2542072832584381, "rewards/progression_diversity/mean": -0.04918673634529114, "rewards/progression_diversity/std": 0.1233910620212555, "rewards/symbolic_reward_accuracy/mean": 0.783203125, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.8396809697151184, "rewards/symbolic_reward_partial_score/std": 0.3393942713737488, "rewards/tag_count_reward/mean": -0.119140625, "rewards/tag_count_reward/std": 0.32427072525024414, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0222926139831543, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 308.0, "sampling/sampling_logp_difference/mean": 6.464677810668945, "step": 1353 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.30209778249263763, "epoch": 3.5631578947368423, "grad_norm": 0.01030214224010706, "learning_rate": 1e-06, "loss": 0.1664, "step": 1354 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.2921713888645172, "epoch": 3.5657894736842106, "grad_norm": 0.3698064982891083, "learning_rate": 1e-06, "loss": 0.2283, "step": 1355 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.453125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.2947758883237839, "epoch": 3.568421052631579, "grad_norm": 0.0077121201902627945, "learning_rate": 1e-06, "loss": 0.2436, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10491.0, "completions/mean_length": 2377.16796875, "completions/mean_terminated_length": 622.4659423828125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.30925987660884857, "epoch": 3.5710526315789473, "frac_reward_zero_std": 0.15625, "grad_norm": 539.5180053710938, "learning_rate": 1e-06, "loss": 0.1286, "num_tokens": 449834314.0, "reward": 0.7453286647796631, "reward_std": 0.21328219771385193, "rewards/progression_diversity/mean": -0.03745080530643463, "rewards/progression_diversity/std": 0.10799506306648254, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.8756510019302368, "rewards/symbolic_reward_partial_score/std": 0.29454636573791504, "rewards/tag_count_reward/mean": -0.103515625, "rewards/tag_count_reward/std": 0.30492907762527466, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.033726692199707, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 312.0, "sampling/sampling_logp_difference/mean": 5.676729202270508, "step": 1357 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.31313398480415344, "epoch": 3.5736842105263156, "grad_norm": 0.014206201769411564, "learning_rate": 1e-06, "loss": 0.1543, "step": 1358 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3828125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2981453835964203, "epoch": 3.5763157894736843, "grad_norm": 0.24512311816215515, "learning_rate": 1e-06, "loss": 0.2296, "step": 1359 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.3176209479570389, "epoch": 3.5789473684210527, "grad_norm": 0.02592810057103634, "learning_rate": 1e-06, "loss": 0.2541, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14099.0, "completions/mean_length": 2200.619140625, "completions/mean_terminated_length": 631.5249633789062, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.320136159658432, "epoch": 3.581578947368421, "frac_reward_zero_std": 0.28125, "grad_norm": 498.01348876953125, "learning_rate": 1e-06, "loss": 0.1276, "num_tokens": 451356839.0, "reward": 0.7508112192153931, "reward_std": 0.2062763273715973, "rewards/progression_diversity/mean": -0.031185226514935493, "rewards/progression_diversity/std": 0.09589832276105881, "rewards/symbolic_reward_accuracy/mean": 0.82421875, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.8839517831802368, "rewards/symbolic_reward_partial_score/std": 0.2886301875114441, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0460567474365234, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 312.0, "sampling/sampling_logp_difference/mean": 4.3595733642578125, "step": 1361 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3112920671701431, "epoch": 3.5842105263157897, "grad_norm": 0.022203197702765465, "learning_rate": 1e-06, "loss": 0.218, "step": 1362 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.31792403757572174, "epoch": 3.586842105263158, "grad_norm": 0.020310793071985245, "learning_rate": 1e-06, "loss": 0.1697, "step": 1363 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3065248429775238, "epoch": 3.5894736842105264, "grad_norm": 0.00812375359237194, "learning_rate": 1e-06, "loss": 0.1274, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.138671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 11008.0, "completions/mean_length": 2851.1796875, "completions/mean_terminated_length": 672.4263305664062, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.2910042405128479, "epoch": 3.5921052631578947, "frac_reward_zero_std": 0.15625, "grad_norm": 208.19349670410156, "learning_rate": 1e-06, "loss": 0.2613, "num_tokens": 453229795.0, "reward": 0.699255645275116, "reward_std": 0.22771009802818298, "rewards/progression_diversity/mean": -0.04025688022375107, "rewards/progression_diversity/std": 0.10407532006502151, "rewards/symbolic_reward_accuracy/mean": 0.76171875, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.8439127802848816, "rewards/symbolic_reward_partial_score/std": 0.31969720125198364, "rewards/tag_count_reward/mean": -0.10546875, "rewards/tag_count_reward/std": 0.3074568510055542, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0359622240066528, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 312.0, "sampling/sampling_logp_difference/mean": 5.4447712898254395, "step": 1365 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2833098769187927, "epoch": 3.594736842105263, "grad_norm": 1.1845999956130981, "learning_rate": 1e-06, "loss": 0.1663, "step": 1366 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2765520215034485, "epoch": 3.5973684210526313, "grad_norm": 4.952581882476807, "learning_rate": 1e-06, "loss": 0.2748, "step": 1367 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2997213453054428, "epoch": 3.6, "grad_norm": 0.008390809409320354, "learning_rate": 1e-06, "loss": 0.1059, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.150390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 2958.927734375, "completions/mean_terminated_length": 582.53564453125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.2849253863096237, "epoch": 3.6026315789473684, "frac_reward_zero_std": 0.125, "grad_norm": 119.27400207519531, "learning_rate": 1e-06, "loss": 0.1874, "num_tokens": 455127902.0, "reward": 0.7533248662948608, "reward_std": 0.24902209639549255, "rewards/progression_diversity/mean": -0.048372894525527954, "rewards/progression_diversity/std": 0.11722088605165482, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.8759765625, "rewards/symbolic_reward_partial_score/std": 0.3028711676597595, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.029304027557373, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 312.0, "sampling/sampling_logp_difference/mean": 6.3514299392700195, "step": 1369 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.29380667209625244, "epoch": 3.6052631578947367, "grad_norm": 4.995346546173096, "learning_rate": 1e-06, "loss": 0.209, "step": 1370 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.2862599343061447, "epoch": 3.6078947368421055, "grad_norm": 794.2086181640625, "learning_rate": 1e-06, "loss": 0.2462, "step": 1371 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.26471564173698425, "epoch": 3.610526315789474, "grad_norm": 0.02694852091372013, "learning_rate": 1e-06, "loss": 0.2141, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.18359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 3480.74609375, "completions/mean_terminated_length": 579.057373046875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.29130323231220245, "epoch": 3.613157894736842, "frac_reward_zero_std": 0.0625, "grad_norm": 67.4537353515625, "learning_rate": 1e-06, "loss": 0.1288, "num_tokens": 457293500.0, "reward": 0.7533182501792908, "reward_std": 0.19987264275550842, "rewards/progression_diversity/mean": -0.0636826902627945, "rewards/progression_diversity/std": 0.13695798814296722, "rewards/symbolic_reward_accuracy/mean": 0.83203125, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.8790690302848816, "rewards/symbolic_reward_partial_score/std": 0.2996877133846283, "rewards/tag_count_reward/mean": -0.08984375, "rewards/tag_count_reward/std": 0.2862374484539032, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0184675455093384, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 312.0, "sampling/sampling_logp_difference/mean": 7.7314229011535645, "step": 1373 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.24614471197128296, "epoch": 3.6157894736842104, "grad_norm": 132.7631378173828, "learning_rate": 1e-06, "loss": 0.2735, "step": 1374 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.23733964562416077, "epoch": 3.6184210526315788, "grad_norm": 0.007347916718572378, "learning_rate": 1e-06, "loss": 0.2669, "step": 1375 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4140625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.262113094329834, "epoch": 3.6210526315789475, "grad_norm": 0.00864808913320303, "learning_rate": 1e-06, "loss": 0.2765, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 2676.58984375, "completions/mean_terminated_length": 577.2567749023438, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.2994058430194855, "epoch": 3.623684210526316, "frac_reward_zero_std": 0.1875, "grad_norm": 173.56097412109375, "learning_rate": 1e-06, "loss": 0.1616, "num_tokens": 459061546.0, "reward": 0.8030776977539062, "reward_std": 0.17727278172969818, "rewards/progression_diversity/mean": -0.04867924749851227, "rewards/progression_diversity/std": 0.1260039210319519, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.9161783456802368, "rewards/symbolic_reward_partial_score/std": 0.2622043490409851, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0174307823181152, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 312.0, "sampling/sampling_logp_difference/mean": 7.301526069641113, "step": 1377 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2790711969137192, "epoch": 3.626315789473684, "grad_norm": 6.741792678833008, "learning_rate": 1e-06, "loss": 0.3263, "step": 1378 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.28607361018657684, "epoch": 3.6289473684210525, "grad_norm": 0.06569147855043411, "learning_rate": 1e-06, "loss": 0.1445, "step": 1379 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2860715389251709, "epoch": 3.6315789473684212, "grad_norm": 0.07871459424495697, "learning_rate": 1e-06, "loss": 0.1596, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 9562.0, "completions/mean_length": 2981.08984375, "completions/mean_terminated_length": 644.802734375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.28075069189071655, "epoch": 3.6342105263157896, "frac_reward_zero_std": 0.1875, "grad_norm": 278.2060546875, "learning_rate": 1e-06, "loss": 0.1905, "num_tokens": 460993176.0, "reward": 0.6964207887649536, "reward_std": 0.240301251411438, "rewards/progression_diversity/mean": -0.0503075085580349, "rewards/progression_diversity/std": 0.12393683940172195, "rewards/symbolic_reward_accuracy/mean": 0.767578125, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.82958984375, "rewards/symbolic_reward_partial_score/std": 0.34416452050209045, "rewards/tag_count_reward/mean": -0.125, "rewards/tag_count_reward/std": 0.3310423493385315, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0247937440872192, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 314.0, "sampling/sampling_logp_difference/mean": 6.386007308959961, "step": 1381 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.28298284113407135, "epoch": 3.636842105263158, "grad_norm": 0.013535849750041962, "learning_rate": 1e-06, "loss": 0.2295, "step": 1382 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2968677282333374, "epoch": 3.639473684210526, "grad_norm": 0.01233405340462923, "learning_rate": 1e-06, "loss": 0.1708, "step": 1383 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2959662228822708, "epoch": 3.6421052631578945, "grad_norm": 0.6275126934051514, "learning_rate": 1e-06, "loss": 0.1565, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.126953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12655.0, "completions/mean_length": 2703.05078125, "completions/mean_terminated_length": 713.6510009765625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.3054865449666977, "epoch": 3.6447368421052633, "frac_reward_zero_std": 0.03125, "grad_norm": 223.1032257080078, "learning_rate": 1e-06, "loss": 0.1583, "num_tokens": 462803986.0, "reward": 0.7047585844993591, "reward_std": 0.28779661655426025, "rewards/progression_diversity/mean": -0.046601302921772, "rewards/progression_diversity/std": 0.12478232383728027, "rewards/symbolic_reward_accuracy/mean": 0.771484375, "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, "rewards/symbolic_reward_partial_score/mean": 0.8448892831802368, "rewards/symbolic_reward_partial_score/std": 0.32900720834732056, "rewards/tag_count_reward/mean": -0.111328125, "rewards/tag_count_reward/std": 0.31484565138816833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0184571743011475, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 316.0, "sampling/sampling_logp_difference/mean": 7.270891189575195, "step": 1385 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.29785533249378204, "epoch": 3.6473684210526316, "grad_norm": 0.015050256624817848, "learning_rate": 1e-06, "loss": 0.229, "step": 1386 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3828125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2764730453491211, "epoch": 3.65, "grad_norm": 0.25503814220428467, "learning_rate": 1e-06, "loss": 0.2498, "step": 1387 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.290697917342186, "epoch": 3.6526315789473687, "grad_norm": 4.086553573608398, "learning_rate": 1e-06, "loss": 0.2406, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3594.0, "completions/mean_length": 2599.453125, "completions/mean_terminated_length": 630.232177734375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.3037455379962921, "epoch": 3.655263157894737, "frac_reward_zero_std": 0.0625, "grad_norm": 311.0882873535156, "learning_rate": 1e-06, "loss": 0.1859, "num_tokens": 464557082.0, "reward": 0.7203487157821655, "reward_std": 0.24300315976142883, "rewards/progression_diversity/mean": -0.04520959407091141, "rewards/progression_diversity/std": 0.12297887355089188, "rewards/symbolic_reward_accuracy/mean": 0.7890625, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.8623046875, "rewards/symbolic_reward_partial_score/std": 0.31785058975219727, "rewards/tag_count_reward/mean": -0.11328125, "rewards/tag_count_reward/std": 0.3172462284564972, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0264785289764404, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 318.0, "sampling/sampling_logp_difference/mean": 6.508614540100098, "step": 1389 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4453125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.30377230048179626, "epoch": 3.6578947368421053, "grad_norm": 0.015176265500485897, "learning_rate": 1e-06, "loss": 0.1212, "step": 1390 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.421875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.3056159168481827, "epoch": 3.6605263157894736, "grad_norm": 0.1255595088005066, "learning_rate": 1e-06, "loss": 0.2117, "step": 1391 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3828125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.29334986209869385, "epoch": 3.663157894736842, "grad_norm": 0.023698071017861366, "learning_rate": 1e-06, "loss": 0.2919, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1780.5703125, "completions/mean_terminated_length": 609.8311767578125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.32029321789741516, "epoch": 3.6657894736842103, "frac_reward_zero_std": 0.375, "grad_norm": 217.3709259033203, "learning_rate": 1e-06, "loss": 0.1288, "num_tokens": 465858846.0, "reward": 0.7777209281921387, "reward_std": 0.20324808359146118, "rewards/progression_diversity/mean": -0.025765735656023026, "rewards/progression_diversity/std": 0.09416744112968445, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.90771484375, "rewards/symbolic_reward_partial_score/std": 0.2606476843357086, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0482451915740967, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 320.0, "sampling/sampling_logp_difference/mean": 4.223204612731934, "step": 1393 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.31482554972171783, "epoch": 3.668421052631579, "grad_norm": 2.6809914112091064, "learning_rate": 1e-06, "loss": 0.0911, "step": 1394 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3331180214881897, "epoch": 3.6710526315789473, "grad_norm": 0.01906052976846695, "learning_rate": 1e-06, "loss": 0.1226, "step": 1395 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.32584846019744873, "epoch": 3.6736842105263157, "grad_norm": 60.955745697021484, "learning_rate": 1e-06, "loss": 0.1089, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2274.0, "completions/mean_length": 2306.021484375, "completions/mean_terminated_length": 646.1724853515625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.3163197785615921, "epoch": 3.6763157894736844, "frac_reward_zero_std": 0.28125, "grad_norm": 166.03375244140625, "learning_rate": 1e-06, "loss": 0.1648, "num_tokens": 467440009.0, "reward": 0.7467856407165527, "reward_std": 0.22565452754497528, "rewards/progression_diversity/mean": -0.038237735629081726, "rewards/progression_diversity/std": 0.11381914466619492, "rewards/symbolic_reward_accuracy/mean": 0.8203125, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.8811848759651184, "rewards/symbolic_reward_partial_score/std": 0.2920258939266205, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.042597770690918, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 320.0, "sampling/sampling_logp_difference/mean": 5.3379316329956055, "step": 1397 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.31748393177986145, "epoch": 3.6789473684210527, "grad_norm": 256.6326599121094, "learning_rate": 1e-06, "loss": 0.1745, "step": 1398 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.30522364377975464, "epoch": 3.681578947368421, "grad_norm": 2.4348554611206055, "learning_rate": 1e-06, "loss": 0.1645, "step": 1399 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.30944982171058655, "epoch": 3.6842105263157894, "grad_norm": 3.376948356628418, "learning_rate": 1e-06, "loss": 0.1815, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 1631.30078125, "completions/mean_terminated_length": 581.9456176757812, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.3202086091041565, "epoch": 3.6868421052631577, "frac_reward_zero_std": 0.40625, "grad_norm": 520.1287841796875, "learning_rate": 1e-06, "loss": 0.1837, "num_tokens": 468665507.0, "reward": 0.823440432548523, "reward_std": 0.1710093915462494, "rewards/progression_diversity/mean": -0.024127831682562828, "rewards/progression_diversity/std": 0.09204965829849243, "rewards/symbolic_reward_accuracy/mean": 0.916015625, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.93310546875, "rewards/symbolic_reward_partial_score/std": 0.2400180548429489, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0514576435089111, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 322.0, "sampling/sampling_logp_difference/mean": 4.249440670013428, "step": 1401 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.33165352046489716, "epoch": 3.6894736842105265, "grad_norm": 0.17460408806800842, "learning_rate": 1e-06, "loss": 0.1594, "step": 1402 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3367105722427368, "epoch": 3.692105263157895, "grad_norm": 0.006330076605081558, "learning_rate": 1e-06, "loss": 0.1336, "step": 1403 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.3383978605270386, "epoch": 3.694736842105263, "grad_norm": 0.006822290364652872, "learning_rate": 1e-06, "loss": 0.0894, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 2382.189453125, "completions/mean_terminated_length": 628.1165161132812, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.30853356420993805, "epoch": 3.6973684210526314, "frac_reward_zero_std": 0.3125, "grad_norm": 92.30716705322266, "learning_rate": 1e-06, "loss": 0.0998, "num_tokens": 470292484.0, "reward": 0.7492548823356628, "reward_std": 0.22711637616157532, "rewards/progression_diversity/mean": -0.04033348336815834, "rewards/progression_diversity/std": 0.11694171279668808, "rewards/symbolic_reward_accuracy/mean": 0.828125, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.8758138418197632, "rewards/symbolic_reward_partial_score/std": 0.30300623178482056, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0440869331359863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 324.0, "sampling/sampling_logp_difference/mean": 4.415769100189209, "step": 1405 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3130802512168884, "epoch": 3.7, "grad_norm": 4.948612689971924, "learning_rate": 1e-06, "loss": 0.1542, "step": 1406 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2954626977443695, "epoch": 3.7026315789473685, "grad_norm": 214.0167694091797, "learning_rate": 1e-06, "loss": 0.2376, "step": 1407 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.29514195024967194, "epoch": 3.705263157894737, "grad_norm": 0.019930332899093628, "learning_rate": 1e-06, "loss": 0.1884, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2248.0, "completions/mean_length": 1681.869140625, "completions/mean_terminated_length": 569.9432983398438, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3324284106492996, "epoch": 3.707894736842105, "frac_reward_zero_std": 0.21875, "grad_norm": 933.0283203125, "learning_rate": 1e-06, "loss": 0.1402, "num_tokens": 471541729.0, "reward": 0.8024474382400513, "reward_std": 0.21313899755477905, "rewards/progression_diversity/mean": -0.028701424598693848, "rewards/progression_diversity/std": 0.10557378083467484, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9270833134651184, "rewards/symbolic_reward_partial_score/std": 0.2357022613286972, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0388381481170654, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 324.0, "sampling/sampling_logp_difference/mean": 4.987360954284668, "step": 1409 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3176589757204056, "epoch": 3.7105263157894735, "grad_norm": 0.014979632571339607, "learning_rate": 1e-06, "loss": 0.174, "step": 1410 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.32269425690174103, "epoch": 3.713157894736842, "grad_norm": 0.00700529245659709, "learning_rate": 1e-06, "loss": 0.1423, "step": 1411 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3213765025138855, "epoch": 3.7157894736842105, "grad_norm": 0.2787642180919647, "learning_rate": 1e-06, "loss": 0.1043, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 2176.283203125, "completions/mean_terminated_length": 638.6514892578125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.31092438101768494, "epoch": 3.718421052631579, "frac_reward_zero_std": 0.1875, "grad_norm": 331.7125244140625, "learning_rate": 1e-06, "loss": 0.171, "num_tokens": 473063122.0, "reward": 0.7549986839294434, "reward_std": 0.18977715075016022, "rewards/progression_diversity/mean": -0.037239447236061096, "rewards/progression_diversity/std": 0.11695530265569687, "rewards/symbolic_reward_accuracy/mean": 0.826171875, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.8948567509651184, "rewards/symbolic_reward_partial_score/std": 0.2710212469100952, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0411852598190308, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 326.0, "sampling/sampling_logp_difference/mean": 4.672489166259766, "step": 1413 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.29592007398605347, "epoch": 3.7210526315789476, "grad_norm": 225.3509979248047, "learning_rate": 1e-06, "loss": 0.211, "step": 1414 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.3040802776813507, "epoch": 3.723684210526316, "grad_norm": 0.005704566836357117, "learning_rate": 1e-06, "loss": 0.1706, "step": 1415 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.4140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.33103369176387787, "epoch": 3.7263157894736842, "grad_norm": 0.011854211799800396, "learning_rate": 1e-06, "loss": 0.0719, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1917.099609375, "completions/mean_terminated_length": 590.7100219726562, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.295977458357811, "epoch": 3.7289473684210526, "frac_reward_zero_std": 0.21875, "grad_norm": 196.44320678710938, "learning_rate": 1e-06, "loss": 0.2175, "num_tokens": 474441989.0, "reward": 0.7707229852676392, "reward_std": 0.23186513781547546, "rewards/progression_diversity/mean": -0.03708141669631004, "rewards/progression_diversity/std": 0.12436394393444061, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.8990885019302368, "rewards/symbolic_reward_partial_score/std": 0.27446648478507996, "rewards/tag_count_reward/mean": -0.072265625, "rewards/tag_count_reward/std": 0.2591804563999176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0268856287002563, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 328.0, "sampling/sampling_logp_difference/mean": 5.979172229766846, "step": 1417 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.31002572178840637, "epoch": 3.731578947368421, "grad_norm": 1676.92724609375, "learning_rate": 1e-06, "loss": 0.4721, "step": 1418 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3273525983095169, "epoch": 3.734210526315789, "grad_norm": 0.009710345417261124, "learning_rate": 1e-06, "loss": 0.0811, "step": 1419 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3278929442167282, "epoch": 3.736842105263158, "grad_norm": 0.014435559511184692, "learning_rate": 1e-06, "loss": 0.145, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 2402.435546875, "completions/mean_terminated_length": 616.2444458007812, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.308517649769783, "epoch": 3.7394736842105263, "frac_reward_zero_std": 0.1875, "grad_norm": 69.96543884277344, "learning_rate": 1e-06, "loss": 0.1305, "num_tokens": 476068004.0, "reward": 0.755204439163208, "reward_std": 0.24294275045394897, "rewards/progression_diversity/mean": -0.050845738500356674, "rewards/progression_diversity/std": 0.14557471871376038, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.88037109375, "rewards/symbolic_reward_partial_score/std": 0.296750545501709, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0160291194915771, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 328.0, "sampling/sampling_logp_difference/mean": 7.112195014953613, "step": 1421 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.29520660638809204, "epoch": 3.7421052631578946, "grad_norm": 1305.361083984375, "learning_rate": 1e-06, "loss": 0.298, "step": 1422 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2944742292165756, "epoch": 3.7447368421052634, "grad_norm": 1.5552692413330078, "learning_rate": 1e-06, "loss": 0.1569, "step": 1423 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2920891344547272, "epoch": 3.7473684210526317, "grad_norm": 0.0419452041387558, "learning_rate": 1e-06, "loss": 0.2029, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 2013.22265625, "completions/mean_terminated_length": 594.6480712890625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.30936600267887115, "epoch": 3.75, "frac_reward_zero_std": 0.4375, "grad_norm": 129.8343048095703, "learning_rate": 1e-06, "loss": 0.0752, "num_tokens": 477496566.0, "reward": 0.7965672016143799, "reward_std": 0.15301448106765747, "rewards/progression_diversity/mean": -0.03566431626677513, "rewards/progression_diversity/std": 0.11889361590147018, "rewards/symbolic_reward_accuracy/mean": 0.87890625, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.9246419072151184, "rewards/symbolic_reward_partial_score/std": 0.23886118829250336, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0362024307250977, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 330.0, "sampling/sampling_logp_difference/mean": 4.878294467926025, "step": 1425 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.32300904393196106, "epoch": 3.7526315789473683, "grad_norm": 0.02335791103541851, "learning_rate": 1e-06, "loss": 0.1367, "step": 1426 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31392285227775574, "epoch": 3.7552631578947366, "grad_norm": 0.0137960035353899, "learning_rate": 1e-06, "loss": 0.1808, "step": 1427 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3122747987508774, "epoch": 3.7578947368421054, "grad_norm": 0.0074070231057703495, "learning_rate": 1e-06, "loss": 0.1176, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11201.0, "completions/mean_length": 2229.384765625, "completions/mean_terminated_length": 594.9738159179688, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.3264199197292328, "epoch": 3.7605263157894737, "frac_reward_zero_std": 0.3125, "grad_norm": 201.52215576171875, "learning_rate": 1e-06, "loss": 0.0746, "num_tokens": 479012763.0, "reward": 0.7792283296585083, "reward_std": 0.16983628273010254, "rewards/progression_diversity/mean": -0.041039757430553436, "rewards/progression_diversity/std": 0.12305484712123871, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.91064453125, "rewards/symbolic_reward_partial_score/std": 0.25674015283584595, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0295979976654053, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 332.0, "sampling/sampling_logp_difference/mean": 6.160717964172363, "step": 1429 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.30331704020500183, "epoch": 3.763157894736842, "grad_norm": 0.012537804432213306, "learning_rate": 1e-06, "loss": 0.1338, "step": 1430 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3093401789665222, "epoch": 3.765789473684211, "grad_norm": 0.014041773974895477, "learning_rate": 1e-06, "loss": 0.1339, "step": 1431 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.29702627658843994, "epoch": 3.768421052631579, "grad_norm": 0.010763847269117832, "learning_rate": 1e-06, "loss": 0.2351, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 2045.2734375, "completions/mean_terminated_length": 629.8626708984375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.32291169464588165, "epoch": 3.7710526315789474, "frac_reward_zero_std": 0.21875, "grad_norm": 403.7364807128906, "learning_rate": 1e-06, "loss": 0.1425, "num_tokens": 480464935.0, "reward": 0.7736941576004028, "reward_std": 0.2378438413143158, "rewards/progression_diversity/mean": -0.03292561694979668, "rewards/progression_diversity/std": 0.10725618153810501, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.9016926884651184, "rewards/symbolic_reward_partial_score/std": 0.265513151884079, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.034686803817749, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 332.0, "sampling/sampling_logp_difference/mean": 5.4915852546691895, "step": 1433 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3211905360221863, "epoch": 3.7736842105263158, "grad_norm": 0.015537317842245102, "learning_rate": 1e-06, "loss": 0.1144, "step": 1434 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.33275802433490753, "epoch": 3.776315789473684, "grad_norm": 0.014047092758119106, "learning_rate": 1e-06, "loss": 0.117, "step": 1435 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3044770658016205, "epoch": 3.7789473684210524, "grad_norm": 0.010970460250973701, "learning_rate": 1e-06, "loss": 0.1902, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12968.0, "completions/mean_length": 3075.37890625, "completions/mean_terminated_length": 683.5068969726562, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.2942586690187454, "epoch": 3.781578947368421, "frac_reward_zero_std": 0.125, "grad_norm": 349.7894287109375, "learning_rate": 1e-06, "loss": 0.1484, "num_tokens": 482467017.0, "reward": 0.6715598702430725, "reward_std": 0.2547609806060791, "rewards/progression_diversity/mean": -0.05104883760213852, "rewards/progression_diversity/std": 0.12435862421989441, "rewards/symbolic_reward_accuracy/mean": 0.732421875, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.8229166269302368, "rewards/symbolic_reward_partial_score/std": 0.34593749046325684, "rewards/tag_count_reward/mean": -0.142578125, "rewards/tag_count_reward/std": 0.3499840497970581, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0357654094696045, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 332.0, "sampling/sampling_logp_difference/mean": 5.477702617645264, "step": 1437 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.27393701672554016, "epoch": 3.7842105263157895, "grad_norm": 0.00826684758067131, "learning_rate": 1e-06, "loss": 0.2608, "step": 1438 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.390625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.3037470281124115, "epoch": 3.786842105263158, "grad_norm": 0.006643475033342838, "learning_rate": 1e-06, "loss": 0.1667, "step": 1439 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.28216809034347534, "epoch": 3.7894736842105265, "grad_norm": 0.01340513676404953, "learning_rate": 1e-06, "loss": 0.2354, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 1698.798828125, "completions/mean_terminated_length": 588.1533813476562, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.33806556463241577, "epoch": 3.792105263157895, "frac_reward_zero_std": 0.34375, "grad_norm": 164.4355926513672, "learning_rate": 1e-06, "loss": 0.0763, "num_tokens": 483743938.0, "reward": 0.780125617980957, "reward_std": 0.18881092965602875, "rewards/progression_diversity/mean": -0.02455180510878563, "rewards/progression_diversity/std": 0.09061013907194138, "rewards/symbolic_reward_accuracy/mean": 0.849609375, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.9208984375, "rewards/symbolic_reward_partial_score/std": 0.2368720918893814, "rewards/tag_count_reward/mean": -0.056640625, "rewards/tag_count_reward/std": 0.23138070106506348, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0537469387054443, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 332.0, "sampling/sampling_logp_difference/mean": 3.5584030151367188, "step": 1441 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.31358209252357483, "epoch": 3.794736842105263, "grad_norm": 0.010208331979811192, "learning_rate": 1e-06, "loss": 0.1884, "step": 1442 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3256896734237671, "epoch": 3.7973684210526315, "grad_norm": 0.05766785889863968, "learning_rate": 1e-06, "loss": 0.0753, "step": 1443 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.32447347044944763, "epoch": 3.8, "grad_norm": 0.014500455930829048, "learning_rate": 1e-06, "loss": 0.1182, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 2110.767578125, "completions/mean_terminated_length": 566.0454711914062, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.30462731420993805, "epoch": 3.8026315789473686, "frac_reward_zero_std": 0.21875, "grad_norm": 307.380615234375, "learning_rate": 1e-06, "loss": 0.1169, "num_tokens": 485208267.0, "reward": 0.7762401700019836, "reward_std": 0.18577754497528076, "rewards/progression_diversity/mean": -0.032237473875284195, "rewards/progression_diversity/std": 0.10042642056941986, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.9088541269302368, "rewards/symbolic_reward_partial_score/std": 0.2546987235546112, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.053719401359558, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 336.0, "sampling/sampling_logp_difference/mean": 3.9605154991149902, "step": 1445 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3047986924648285, "epoch": 3.805263157894737, "grad_norm": 0.018491854891180992, "learning_rate": 1e-06, "loss": 0.1968, "step": 1446 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3292408138513565, "epoch": 3.807894736842105, "grad_norm": 2.424530267715454, "learning_rate": 1e-06, "loss": 0.0355, "step": 1447 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2958214730024338, "epoch": 3.8105263157894735, "grad_norm": 0.017129186540842056, "learning_rate": 1e-06, "loss": 0.2069, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16038.0, "completions/mean_length": 2222.494140625, "completions/mean_terminated_length": 621.6282348632812, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.3045518100261688, "epoch": 3.8131578947368423, "frac_reward_zero_std": 0.25, "grad_norm": 98.93772888183594, "learning_rate": 1e-06, "loss": 0.1575, "num_tokens": 486730280.0, "reward": 0.7752275466918945, "reward_std": 0.1814824491739273, "rewards/progression_diversity/mean": -0.03583987057209015, "rewards/progression_diversity/std": 0.10934660583734512, "rewards/symbolic_reward_accuracy/mean": 0.857421875, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.8990885615348816, "rewards/symbolic_reward_partial_score/std": 0.27117884159088135, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0510573387145996, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 336.0, "sampling/sampling_logp_difference/mean": 4.26395320892334, "step": 1449 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.31291763484477997, "epoch": 3.8157894736842106, "grad_norm": 20.84035301208496, "learning_rate": 1e-06, "loss": 0.1215, "step": 1450 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.30178968608379364, "epoch": 3.818421052631579, "grad_norm": 0.08378605544567108, "learning_rate": 1e-06, "loss": 0.1589, "step": 1451 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.30265994369983673, "epoch": 3.8210526315789473, "grad_norm": 0.11986826360225677, "learning_rate": 1e-06, "loss": 0.1607, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 10509.0, "completions/mean_length": 2041.810546875, "completions/mean_terminated_length": 592.169921875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.3079294115304947, "epoch": 3.8236842105263156, "frac_reward_zero_std": 0.21875, "grad_norm": 155.3240509033203, "learning_rate": 1e-06, "loss": 0.1091, "num_tokens": 488187015.0, "reward": 0.7332264184951782, "reward_std": 0.21755467355251312, "rewards/progression_diversity/mean": -0.03185119852423668, "rewards/progression_diversity/std": 0.10352633893489838, "rewards/symbolic_reward_accuracy/mean": 0.791015625, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.8904622197151184, "rewards/symbolic_reward_partial_score/std": 0.26156580448150635, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0546609163284302, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 336.0, "sampling/sampling_logp_difference/mean": 4.185214042663574, "step": 1453 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3079414367675781, "epoch": 3.8263157894736843, "grad_norm": 0.01617673970758915, "learning_rate": 1e-06, "loss": 0.1361, "step": 1454 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.30259697139263153, "epoch": 3.8289473684210527, "grad_norm": 0.010684851557016373, "learning_rate": 1e-06, "loss": 0.1832, "step": 1455 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.29774773120880127, "epoch": 3.831578947368421, "grad_norm": 0.026969242841005325, "learning_rate": 1e-06, "loss": 0.1849, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15120.0, "completions/mean_length": 2334.595703125, "completions/mean_terminated_length": 609.2302856445312, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.28709380328655243, "epoch": 3.8342105263157897, "frac_reward_zero_std": 0.1875, "grad_norm": 261.2757568359375, "learning_rate": 1e-06, "loss": 0.273, "num_tokens": 489779800.0, "reward": 0.741887092590332, "reward_std": 0.2086125761270523, "rewards/progression_diversity/mean": -0.039813414216041565, "rewards/progression_diversity/std": 0.11546120047569275, "rewards/symbolic_reward_accuracy/mean": 0.814453125, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.8785807490348816, "rewards/symbolic_reward_partial_score/std": 0.2908572554588318, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0556657314300537, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 340.0, "sampling/sampling_logp_difference/mean": 5.175328731536865, "step": 1457 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3067042976617813, "epoch": 3.836842105263158, "grad_norm": 0.011344866827130318, "learning_rate": 1e-06, "loss": 0.1358, "step": 1458 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3160387873649597, "epoch": 3.8394736842105264, "grad_norm": 2.8747458457946777, "learning_rate": 1e-06, "loss": 0.0778, "step": 1459 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.2950311750173569, "epoch": 3.8421052631578947, "grad_norm": 0.005016846116632223, "learning_rate": 1e-06, "loss": 0.1839, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15092.0, "completions/mean_length": 1931.142578125, "completions/mean_terminated_length": 639.610595703125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.31487199664115906, "epoch": 3.844736842105263, "frac_reward_zero_std": 0.21875, "grad_norm": 225.66839599609375, "learning_rate": 1e-06, "loss": 0.1183, "num_tokens": 491169345.0, "reward": 0.7955160140991211, "reward_std": 0.22690117359161377, "rewards/progression_diversity/mean": -0.033369071781635284, "rewards/progression_diversity/std": 0.11018730700016022, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9132486581802368, "rewards/symbolic_reward_partial_score/std": 0.2617155611515045, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0568856000900269, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 340.0, "sampling/sampling_logp_difference/mean": 4.69308614730835, "step": 1461 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31012292206287384, "epoch": 3.8473684210526313, "grad_norm": 0.009823931381106377, "learning_rate": 1e-06, "loss": 0.1637, "step": 1462 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2923455536365509, "epoch": 3.85, "grad_norm": 0.34986069798469543, "learning_rate": 1e-06, "loss": 0.2035, "step": 1463 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.3000039905309677, "epoch": 3.8526315789473684, "grad_norm": 0.01767495833337307, "learning_rate": 1e-06, "loss": 0.1856, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 9351.0, "completions/mean_length": 1834.5859375, "completions/mean_terminated_length": 634.9513549804688, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.3126460313796997, "epoch": 3.8552631578947367, "frac_reward_zero_std": 0.28125, "grad_norm": 1094.819580078125, "learning_rate": 1e-06, "loss": 0.1071, "num_tokens": 492513805.0, "reward": 0.763613224029541, "reward_std": 0.17882443964481354, "rewards/progression_diversity/mean": -0.03028068132698536, "rewards/progression_diversity/std": 0.10648433864116669, "rewards/symbolic_reward_accuracy/mean": 0.828125, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.9070637822151184, "rewards/symbolic_reward_partial_score/std": 0.24986954033374786, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0567845106124878, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 340.0, "sampling/sampling_logp_difference/mean": 4.487025260925293, "step": 1465 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2963457703590393, "epoch": 3.8578947368421055, "grad_norm": 551.8005981445312, "learning_rate": 1e-06, "loss": 0.2159, "step": 1466 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.30269500613212585, "epoch": 3.860526315789474, "grad_norm": 30.8673152923584, "learning_rate": 1e-06, "loss": 0.1849, "step": 1467 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.31897950172424316, "epoch": 3.863157894736842, "grad_norm": 0.007765746209770441, "learning_rate": 1e-06, "loss": 0.0966, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14826.0, "completions/mean_length": 2125.513671875, "completions/mean_terminated_length": 650.4978637695312, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.2966301888227463, "epoch": 3.8657894736842104, "frac_reward_zero_std": 0.15625, "grad_norm": 295.4943542480469, "learning_rate": 1e-06, "loss": 0.2116, "num_tokens": 493999860.0, "reward": 0.7747255563735962, "reward_std": 0.2700809836387634, "rewards/progression_diversity/mean": -0.04209459200501442, "rewards/progression_diversity/std": 0.12887020409107208, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.89697265625, "rewards/symbolic_reward_partial_score/std": 0.2792568802833557, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0543975830078125, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 342.0, "sampling/sampling_logp_difference/mean": 4.8566389083862305, "step": 1469 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2987577021121979, "epoch": 3.8684210526315788, "grad_norm": 2.0050089359283447, "learning_rate": 1e-06, "loss": 0.125, "step": 1470 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.31162601709365845, "epoch": 3.8710526315789475, "grad_norm": 0.4049341678619385, "learning_rate": 1e-06, "loss": 0.0939, "step": 1471 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.27791038155555725, "epoch": 3.873684210526316, "grad_norm": 2.2584075927734375, "learning_rate": 1e-06, "loss": 0.2615, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.126953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3494.0, "completions/mean_length": 2676.2734375, "completions/mean_terminated_length": 682.9798583984375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.2749413251876831, "epoch": 3.876315789473684, "frac_reward_zero_std": 0.21875, "grad_norm": 352.43115234375, "learning_rate": 1e-06, "loss": 0.1901, "num_tokens": 495803296.0, "reward": 0.6940611600875854, "reward_std": 0.22690893709659576, "rewards/progression_diversity/mean": -0.047005537897348404, "rewards/progression_diversity/std": 0.13115794956684113, "rewards/symbolic_reward_accuracy/mean": 0.75390625, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.845703125, "rewards/symbolic_reward_partial_score/std": 0.31764960289001465, "rewards/tag_count_reward/mean": -0.115234375, "rewards/tag_count_reward/std": 0.3196168541908264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0539484024047852, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 342.0, "sampling/sampling_logp_difference/mean": 4.197028636932373, "step": 1473 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2936839610338211, "epoch": 3.8789473684210525, "grad_norm": 0.012438047677278519, "learning_rate": 1e-06, "loss": 0.1267, "step": 1474 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3024512231349945, "epoch": 3.8815789473684212, "grad_norm": 0.14518336951732635, "learning_rate": 1e-06, "loss": 0.1382, "step": 1475 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.2862910181283951, "epoch": 3.8842105263157896, "grad_norm": 0.021601000800728798, "learning_rate": 1e-06, "loss": 0.2073, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5602.0, "completions/mean_length": 1790.48828125, "completions/mean_terminated_length": 653.73046875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.2987203449010849, "epoch": 3.886842105263158, "frac_reward_zero_std": 0.28125, "grad_norm": 451.8078308105469, "learning_rate": 1e-06, "loss": 0.2304, "num_tokens": 497139354.0, "reward": 0.7696309089660645, "reward_std": 0.18139323592185974, "rewards/progression_diversity/mean": -0.024213135242462158, "rewards/progression_diversity/std": 0.09224303066730499, "rewards/symbolic_reward_accuracy/mean": 0.845703125, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.8963216543197632, "rewards/symbolic_reward_partial_score/std": 0.2699549198150635, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065273404121399, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 340.0, "sampling/sampling_logp_difference/mean": 2.8104472160339355, "step": 1477 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3077242374420166, "epoch": 3.889473684210526, "grad_norm": 75.5800552368164, "learning_rate": 1e-06, "loss": 0.1149, "step": 1478 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.32739219069480896, "epoch": 3.8921052631578945, "grad_norm": 0.022421207278966904, "learning_rate": 1e-06, "loss": 0.0204, "step": 1479 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.3064194321632385, "epoch": 3.8947368421052633, "grad_norm": 0.005432396661490202, "learning_rate": 1e-06, "loss": 0.1348, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2170.0, "completions/mean_length": 2075.45703125, "completions/mean_terminated_length": 629.2172241210938, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.29528728127479553, "epoch": 3.8973684210526316, "frac_reward_zero_std": 0.34375, "grad_norm": 82.80768585205078, "learning_rate": 1e-06, "loss": 0.0844, "num_tokens": 498619812.0, "reward": 0.7649828791618347, "reward_std": 0.20222607254981995, "rewards/progression_diversity/mean": -0.034921687096357346, "rewards/progression_diversity/std": 0.11383773386478424, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.9026692509651184, "rewards/symbolic_reward_partial_score/std": 0.26592451333999634, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0633716583251953, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 342.0, "sampling/sampling_logp_difference/mean": 3.5869860649108887, "step": 1481 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3259361535310745, "epoch": 3.9, "grad_norm": 0.030284911394119263, "learning_rate": 1e-06, "loss": 0.0843, "step": 1482 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2944328784942627, "epoch": 3.9026315789473687, "grad_norm": 0.011786861345171928, "learning_rate": 1e-06, "loss": 0.2573, "step": 1483 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.30443601310253143, "epoch": 3.905263157894737, "grad_norm": 0.02485356479883194, "learning_rate": 1e-06, "loss": 0.1844, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1519.84375, "completions/mean_terminated_length": 594.6888427734375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.32855215668678284, "epoch": 3.9078947368421053, "frac_reward_zero_std": 0.3125, "grad_norm": 51.70481491088867, "learning_rate": 1e-06, "loss": 0.1191, "num_tokens": 499803124.0, "reward": 0.8123822212219238, "reward_std": 0.18835322558879852, "rewards/progression_diversity/mean": -0.021549619734287262, "rewards/progression_diversity/std": 0.08892330527305603, "rewards/symbolic_reward_accuracy/mean": 0.8984375, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.9287109375, "rewards/symbolic_reward_partial_score/std": 0.23190389573574066, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0671436786651611, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 342.0, "sampling/sampling_logp_difference/mean": 3.182962417602539, "step": 1485 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.30832575261592865, "epoch": 3.9105263157894736, "grad_norm": 0.009262947365641594, "learning_rate": 1e-06, "loss": 0.1818, "step": 1486 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3187596797943115, "epoch": 3.913157894736842, "grad_norm": 0.00602568918839097, "learning_rate": 1e-06, "loss": 0.1425, "step": 1487 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3167625367641449, "epoch": 3.9157894736842103, "grad_norm": 0.01127164252102375, "learning_rate": 1e-06, "loss": 0.1257, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 1434.26953125, "completions/mean_terminated_length": 569.4090576171875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.3199300616979599, "epoch": 3.918421052631579, "frac_reward_zero_std": 0.3125, "grad_norm": 47.4603157043457, "learning_rate": 1e-06, "loss": 0.1406, "num_tokens": 500914430.0, "reward": 0.8060013055801392, "reward_std": 0.2019416242837906, "rewards/progression_diversity/mean": -0.019987143576145172, "rewards/progression_diversity/std": 0.088285431265831, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9295247793197632, "rewards/symbolic_reward_partial_score/std": 0.22341255843639374, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0701146125793457, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 344.0, "sampling/sampling_logp_difference/mean": 2.487426280975342, "step": 1489 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.32358092069625854, "epoch": 3.9210526315789473, "grad_norm": 5.1159563064575195, "learning_rate": 1e-06, "loss": 0.074, "step": 1490 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3156203180551529, "epoch": 3.9236842105263157, "grad_norm": 19.548402786254883, "learning_rate": 1e-06, "loss": 0.0988, "step": 1491 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.321828693151474, "epoch": 3.9263157894736844, "grad_norm": 95.19469451904297, "learning_rate": 1e-06, "loss": 0.1048, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9638.0, "completions/mean_length": 1489.11328125, "completions/mean_terminated_length": 594.8033447265625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.31046292185783386, "epoch": 3.9289473684210527, "frac_reward_zero_std": 0.34375, "grad_norm": 160.60206604003906, "learning_rate": 1e-06, "loss": 0.1243, "num_tokens": 502064824.0, "reward": 0.8175588250160217, "reward_std": 0.18224866688251495, "rewards/progression_diversity/mean": -0.02145996317267418, "rewards/progression_diversity/std": 0.08906258642673492, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.9342448115348816, "rewards/symbolic_reward_partial_score/std": 0.2271050065755844, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0690562725067139, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 2.968571662902832, "step": 1493 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.33640483021736145, "epoch": 3.931578947368421, "grad_norm": 0.021926378831267357, "learning_rate": 1e-06, "loss": 0.0626, "step": 1494 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.29776348173618317, "epoch": 3.9342105263157894, "grad_norm": 0.009869229048490524, "learning_rate": 1e-06, "loss": 0.1971, "step": 1495 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.310922309756279, "epoch": 3.9368421052631577, "grad_norm": 0.004013043362647295, "learning_rate": 1e-06, "loss": 0.0687, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9479.0, "completions/mean_length": 1596.375, "completions/mean_terminated_length": 577.6033325195312, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.32028135657310486, "epoch": 3.9394736842105265, "frac_reward_zero_std": 0.375, "grad_norm": 302.3440856933594, "learning_rate": 1e-06, "loss": 0.1131, "num_tokens": 503281304.0, "reward": 0.7774491906166077, "reward_std": 0.17322488129138947, "rewards/progression_diversity/mean": -0.02363717183470726, "rewards/progression_diversity/std": 0.09269430488348007, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.91064453125, "rewards/symbolic_reward_partial_score/std": 0.2503076493740082, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0702153444290161, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 350.0, "sampling/sampling_logp_difference/mean": 2.403035879135132, "step": 1497 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.31212155520915985, "epoch": 3.942105263157895, "grad_norm": 0.007395219057798386, "learning_rate": 1e-06, "loss": 0.108, "step": 1498 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.30698293447494507, "epoch": 3.944736842105263, "grad_norm": 0.010542972013354301, "learning_rate": 1e-06, "loss": 0.1506, "step": 1499 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31951723992824554, "epoch": 3.9473684210526314, "grad_norm": 0.01814758963882923, "learning_rate": 1e-06, "loss": 0.0871, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 9357.0, "completions/mean_length": 1486.361328125, "completions/mean_terminated_length": 624.5144653320312, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.32599151134490967, "epoch": 3.95, "frac_reward_zero_std": 0.40625, "grad_norm": 6.691247940063477, "learning_rate": 1e-06, "loss": 0.0441, "num_tokens": 504458641.0, "reward": 0.7741117477416992, "reward_std": 0.17628371715545654, "rewards/progression_diversity/mean": -0.02046554908156395, "rewards/progression_diversity/std": 0.08719975501298904, "rewards/symbolic_reward_accuracy/mean": 0.841796875, "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, "rewards/symbolic_reward_partial_score/mean": 0.9143880605697632, "rewards/symbolic_reward_partial_score/std": 0.23354806005954742, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0702029466629028, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.0351459980010986, "step": 1501 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.32493776082992554, "epoch": 3.9526315789473685, "grad_norm": 1.328794240951538, "learning_rate": 1e-06, "loss": 0.0413, "step": 1502 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3139204680919647, "epoch": 3.955263157894737, "grad_norm": 0.007978091016411781, "learning_rate": 1e-06, "loss": 0.1326, "step": 1503 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3053445369005203, "epoch": 3.957894736842105, "grad_norm": 1.1158732175827026, "learning_rate": 1e-06, "loss": 0.1221, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2461.0, "completions/mean_length": 1384.337890625, "completions/mean_terminated_length": 614.334716796875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.30869564414024353, "epoch": 3.9605263157894735, "frac_reward_zero_std": 0.4375, "grad_norm": 96.13636016845703, "learning_rate": 1e-06, "loss": 0.1313, "num_tokens": 505584222.0, "reward": 0.8070886135101318, "reward_std": 0.16022410988807678, "rewards/progression_diversity/mean": -0.018680397421121597, "rewards/progression_diversity/std": 0.0852426066994667, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9318033456802368, "rewards/symbolic_reward_partial_score/std": 0.22796770930290222, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0674837827682495, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 2.188668727874756, "step": 1505 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3231370598077774, "epoch": 3.963157894736842, "grad_norm": 2.7390880584716797, "learning_rate": 1e-06, "loss": 0.0322, "step": 1506 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.31320738792419434, "epoch": 3.9657894736842105, "grad_norm": 3.7349812984466553, "learning_rate": 1e-06, "loss": 0.1126, "step": 1507 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.30574825406074524, "epoch": 3.968421052631579, "grad_norm": 0.009325175546109676, "learning_rate": 1e-06, "loss": 0.1404, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 1660.708984375, "completions/mean_terminated_length": 613.4456176757812, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.32026034593582153, "epoch": 3.9710526315789476, "frac_reward_zero_std": 0.21875, "grad_norm": 28.75910186767578, "learning_rate": 1e-06, "loss": 0.057, "num_tokens": 506864681.0, "reward": 0.7385144233703613, "reward_std": 0.21706028282642365, "rewards/progression_diversity/mean": -0.020627107471227646, "rewards/progression_diversity/std": 0.08610701560974121, "rewards/symbolic_reward_accuracy/mean": 0.791015625, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.9012044072151184, "rewards/symbolic_reward_partial_score/std": 0.24362309277057648, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.06502366065979, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 346.0, "sampling/sampling_logp_difference/mean": 2.6898069381713867, "step": 1509 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3142033517360687, "epoch": 3.973684210526316, "grad_norm": 171.92105102539062, "learning_rate": 1e-06, "loss": 0.0856, "step": 1510 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.28372959792613983, "epoch": 3.9763157894736842, "grad_norm": 300.263427734375, "learning_rate": 1e-06, "loss": 0.1618, "step": 1511 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3046766221523285, "epoch": 3.9789473684210526, "grad_norm": 0.013155936263501644, "learning_rate": 1e-06, "loss": 0.1099, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 1467.025390625, "completions/mean_terminated_length": 636.5958862304688, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.3126714378595352, "epoch": 3.981578947368421, "frac_reward_zero_std": 0.34375, "grad_norm": 230.4984588623047, "learning_rate": 1e-06, "loss": 0.1214, "num_tokens": 508025942.0, "reward": 0.7970232963562012, "reward_std": 0.1775650829076767, "rewards/progression_diversity/mean": -0.019354067742824554, "rewards/progression_diversity/std": 0.08400935679674149, "rewards/symbolic_reward_accuracy/mean": 0.873046875, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.9275715947151184, "rewards/symbolic_reward_partial_score/std": 0.22987112402915955, "rewards/tag_count_reward/mean": -0.048828125, "rewards/tag_count_reward/std": 0.2157193273305893, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0662436485290527, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 2.509793281555176, "step": 1513 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.31753115355968475, "epoch": 3.984210526315789, "grad_norm": 0.017170730978250504, "learning_rate": 1e-06, "loss": 0.1007, "step": 1514 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2985118627548218, "epoch": 3.986842105263158, "grad_norm": 0.0552213080227375, "learning_rate": 1e-06, "loss": 0.127, "step": 1515 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31124047935009, "epoch": 3.9894736842105263, "grad_norm": 0.018329912796616554, "learning_rate": 1e-06, "loss": 0.0722, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1958.474609375, "completions/mean_terminated_length": 635.8784790039062, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.285673588514328, "epoch": 3.9921052631578946, "frac_reward_zero_std": 0.28125, "grad_norm": 261.71087646484375, "learning_rate": 1e-06, "loss": 0.2431, "num_tokens": 509438505.0, "reward": 0.7900243997573853, "reward_std": 0.1837175190448761, "rewards/progression_diversity/mean": -0.03076472133398056, "rewards/progression_diversity/std": 0.10606426000595093, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.9117838144302368, "rewards/symbolic_reward_partial_score/std": 0.26197776198387146, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0607240200042725, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 3.408790111541748, "step": 1517 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3019895553588867, "epoch": 3.9947368421052634, "grad_norm": 476.9878845214844, "learning_rate": 1e-06, "loss": 0.1528, "step": 1518 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.29304228723049164, "epoch": 3.9973684210526317, "grad_norm": 0.3619004487991333, "learning_rate": 1e-06, "loss": 0.1619, "step": 1519 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3019654005765915, "epoch": 4.0, "grad_norm": 0.011948428116738796, "learning_rate": 1e-06, "loss": 0.1182, "step": 1520 }, { "epoch": 4.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.034423828125, "eval_completions/max_length": 15897.96875, "eval_completions/max_terminated_length": 1909.625, "eval_completions/mean_length": 1054.68017578125, "eval_completions/mean_terminated_length": 508.42947006225586, "eval_completions/min_length": 225.25, "eval_completions/min_terminated_length": 225.25, "eval_entropy": 0.3078602785244584, "eval_frac_reward_zero_std": 0.30078125, "eval_loss": 0.03949446976184845, "eval_num_tokens": 509438505.0, "eval_reward": 0.8282011039555073, "eval_reward_std": 0.17971097235567868, "eval_rewards/progression_diversity/mean": -0.013755144311289769, "eval_rewards/progression_diversity/std": 0.07045977615052834, "eval_rewards/symbolic_reward_accuracy/mean": 0.91259765625, "eval_rewards/symbolic_reward_accuracy/std": 0.2780101113021374, "eval_rewards/symbolic_reward_partial_score/mean": 0.9448852557688951, "eval_rewards/symbolic_reward_partial_score/std": 0.1947934958152473, "eval_rewards/tag_count_reward/mean": -0.02685546875, "eval_rewards/tag_count_reward/std": 0.15172951598651707, "eval_runtime": 3921.274, "eval_samples_per_second": 0.064, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0713170282542706, "eval_sampling/importance_sampling_ratio/min": 2.523935052067827e-07, "eval_sampling/sampling_logp_difference/max": 336.4912216961384, "eval_sampling/sampling_logp_difference/mean": 1.0463141361251473, "eval_steps_per_second": 0.001, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 2191.412109375, "completions/mean_terminated_length": 621.2993774414062, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.27799971401691437, "epoch": 4.002631578947368, "frac_reward_zero_std": 0.28125, "grad_norm": 205.81446838378906, "learning_rate": 1e-06, "loss": 0.1941, "num_tokens": 510976988.0, "reward": 0.755181074142456, "reward_std": 0.1851288080215454, "rewards/progression_diversity/mean": -0.038541071116924286, "rewards/progression_diversity/std": 0.1202431246638298, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.8818359375, "rewards/symbolic_reward_partial_score/std": 0.2894873321056366, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.055989146232605, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 4.068850994110107, "step": 1521 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30569666624069214, "epoch": 4.005263157894737, "grad_norm": 0.4432198405265808, "learning_rate": 1e-06, "loss": 0.0885, "step": 1522 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.28277716040611267, "epoch": 4.007894736842105, "grad_norm": 0.008685186505317688, "learning_rate": 1e-06, "loss": 0.1703, "step": 1523 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2912045568227768, "epoch": 4.010526315789473, "grad_norm": 0.018372351303696632, "learning_rate": 1e-06, "loss": 0.1348, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2278.0, "completions/mean_length": 1152.287109375, "completions/mean_terminated_length": 565.263671875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.32067057490348816, "epoch": 4.0131578947368425, "frac_reward_zero_std": 0.5, "grad_norm": 151.9108428955078, "learning_rate": 1e-06, "loss": 0.1036, "num_tokens": 511949071.0, "reward": 0.8358957767486572, "reward_std": 0.15225106477737427, "rewards/progression_diversity/mean": -0.013942432589828968, "rewards/progression_diversity/std": 0.07440164685249329, "rewards/symbolic_reward_accuracy/mean": 0.923828125, "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, "rewards/symbolic_reward_partial_score/mean": 0.9501953125, "rewards/symbolic_reward_partial_score/std": 0.1953859031200409, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0619966983795166, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 2.296659469604492, "step": 1525 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3325800895690918, "epoch": 4.015789473684211, "grad_norm": 0.02713238075375557, "learning_rate": 1e-06, "loss": 0.0424, "step": 1526 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3067079782485962, "epoch": 4.018421052631579, "grad_norm": 0.009789610281586647, "learning_rate": 1e-06, "loss": 0.1041, "step": 1527 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3153266906738281, "epoch": 4.021052631578947, "grad_norm": 0.003507691202685237, "learning_rate": 1e-06, "loss": 0.0967, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 1278.15234375, "completions/mean_terminated_length": 599.9306030273438, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.3092862367630005, "epoch": 4.023684210526316, "frac_reward_zero_std": 0.375, "grad_norm": 305.3227233886719, "learning_rate": 1e-06, "loss": 0.0672, "num_tokens": 513002301.0, "reward": 0.7656987905502319, "reward_std": 0.17219382524490356, "rewards/progression_diversity/mean": -0.017035705968737602, "rewards/progression_diversity/std": 0.08615429699420929, "rewards/symbolic_reward_accuracy/mean": 0.81640625, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.93115234375, "rewards/symbolic_reward_partial_score/std": 0.20142759382724762, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0569286346435547, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 2.1248581409454346, "step": 1529 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3113991618156433, "epoch": 4.026315789473684, "grad_norm": 58.646968841552734, "learning_rate": 1e-06, "loss": 0.1007, "step": 1530 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3031221777200699, "epoch": 4.028947368421052, "grad_norm": 0.011349079199135303, "learning_rate": 1e-06, "loss": 0.1034, "step": 1531 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.32167190313339233, "epoch": 4.031578947368421, "grad_norm": 0.010632401332259178, "learning_rate": 1e-06, "loss": 0.0584, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 1502.974609375, "completions/mean_terminated_length": 576.7697143554688, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.3021652102470398, "epoch": 4.03421052631579, "frac_reward_zero_std": 0.46875, "grad_norm": 165.5340576171875, "learning_rate": 1e-06, "loss": 0.1347, "num_tokens": 514159952.0, "reward": 0.8022516965866089, "reward_std": 0.11929187178611755, "rewards/progression_diversity/mean": -0.023855865001678467, "rewards/progression_diversity/std": 0.0998440608382225, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9249674081802368, "rewards/symbolic_reward_partial_score/std": 0.23110011219978333, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0544638633728027, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 2.4234862327575684, "step": 1533 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3111570328474045, "epoch": 4.036842105263158, "grad_norm": 0.01784106157720089, "learning_rate": 1e-06, "loss": 0.1066, "step": 1534 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3170369863510132, "epoch": 4.0394736842105265, "grad_norm": 3.104100227355957, "learning_rate": 1e-06, "loss": 0.0767, "step": 1535 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3106560707092285, "epoch": 4.042105263157895, "grad_norm": 0.010589975863695145, "learning_rate": 1e-06, "loss": 0.1156, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 909.375, "completions/mean_terminated_length": 537.9840087890625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.33030684292316437, "epoch": 4.044736842105263, "frac_reward_zero_std": 0.4375, "grad_norm": 51.92629623413086, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 515009328.0, "reward": 0.8129271268844604, "reward_std": 0.14900250732898712, "rewards/progression_diversity/mean": -0.011006257496774197, "rewards/progression_diversity/std": 0.07234536856412888, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9510090947151184, "rewards/symbolic_reward_partial_score/std": 0.1690969616174698, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0632414817810059, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 1.5446823835372925, "step": 1537 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.32397404313087463, "epoch": 4.0473684210526315, "grad_norm": 0.022417036816477776, "learning_rate": 1e-06, "loss": 0.0486, "step": 1538 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3341270387172699, "epoch": 4.05, "grad_norm": 0.015315905213356018, "learning_rate": 1e-06, "loss": 0.0331, "step": 1539 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.31841354072093964, "epoch": 4.052631578947368, "grad_norm": 61.291114807128906, "learning_rate": 1e-06, "loss": 0.0999, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 1415.52734375, "completions/mean_terminated_length": 582.23095703125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.31011515855789185, "epoch": 4.0552631578947365, "frac_reward_zero_std": 0.40625, "grad_norm": 137.6245574951172, "learning_rate": 1e-06, "loss": 0.0506, "num_tokens": 516117534.0, "reward": 0.7666909694671631, "reward_std": 0.1560915857553482, "rewards/progression_diversity/mean": -0.02036316879093647, "rewards/progression_diversity/std": 0.09054167568683624, "rewards/symbolic_reward_accuracy/mean": 0.83203125, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.9065755009651184, "rewards/symbolic_reward_partial_score/std": 0.24309590458869934, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.058464765548706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.0097904205322266, "step": 1541 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.32095426321029663, "epoch": 4.057894736842106, "grad_norm": 0.008144257590174675, "learning_rate": 1e-06, "loss": 0.0401, "step": 1542 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.30426955223083496, "epoch": 4.060526315789474, "grad_norm": 0.016386935487389565, "learning_rate": 1e-06, "loss": 0.119, "step": 1543 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3272694945335388, "epoch": 4.063157894736842, "grad_norm": 39.390586853027344, "learning_rate": 1e-06, "loss": 0.0768, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3431.0, "completions/mean_length": 1677.0234375, "completions/mean_terminated_length": 597.8951416015625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.3231520652770996, "epoch": 4.065789473684211, "frac_reward_zero_std": 0.375, "grad_norm": 80.80924987792969, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 517367626.0, "reward": 0.7787384390830994, "reward_std": 0.16898325085639954, "rewards/progression_diversity/mean": -0.02655177377164364, "rewards/progression_diversity/std": 0.1030859723687172, "rewards/symbolic_reward_accuracy/mean": 0.849609375, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.9169921875, "rewards/symbolic_reward_partial_score/std": 0.24586138129234314, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0506144762039185, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 4.009866714477539, "step": 1545 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3029029667377472, "epoch": 4.068421052631579, "grad_norm": 0.010388639755547047, "learning_rate": 1e-06, "loss": 0.1362, "step": 1546 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3171136975288391, "epoch": 4.071052631578947, "grad_norm": 0.00802148599177599, "learning_rate": 1e-06, "loss": 0.06, "step": 1547 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.28475718200206757, "epoch": 4.073684210526316, "grad_norm": 0.005422931630164385, "learning_rate": 1e-06, "loss": 0.291, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 1218.509765625, "completions/mean_terminated_length": 665.9210815429688, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.32124534249305725, "epoch": 4.076315789473684, "frac_reward_zero_std": 0.3125, "grad_norm": 359.06634521484375, "learning_rate": 1e-06, "loss": 0.0374, "num_tokens": 518415663.0, "reward": 0.8058355450630188, "reward_std": 0.17662674188613892, "rewards/progression_diversity/mean": -0.012153420597314835, "rewards/progression_diversity/std": 0.06827174872159958, "rewards/symbolic_reward_accuracy/mean": 0.87890625, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.9397786259651184, "rewards/symbolic_reward_partial_score/std": 0.19970273971557617, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0600073337554932, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.2380099296569824, "step": 1549 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.29867467284202576, "epoch": 4.078947368421052, "grad_norm": 0.015004157088696957, "learning_rate": 1e-06, "loss": 0.1029, "step": 1550 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3169206827878952, "epoch": 4.081578947368421, "grad_norm": 0.020958561450242996, "learning_rate": 1e-06, "loss": 0.0588, "step": 1551 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3124808520078659, "epoch": 4.08421052631579, "grad_norm": 0.011239241808652878, "learning_rate": 1e-06, "loss": 0.0997, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 1185.564453125, "completions/mean_terminated_length": 599.823486328125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.315788209438324, "epoch": 4.086842105263158, "frac_reward_zero_std": 0.4375, "grad_norm": 902.8833618164062, "learning_rate": 1e-06, "loss": 0.1105, "num_tokens": 519423472.0, "reward": 0.816677451133728, "reward_std": 0.13706813752651215, "rewards/progression_diversity/mean": -0.011946848593652248, "rewards/progression_diversity/std": 0.0664309710264206, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.9440103769302368, "rewards/symbolic_reward_partial_score/std": 0.19122956693172455, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0609365701675415, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.0578854084014893, "step": 1553 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3234410434961319, "epoch": 4.089473684210526, "grad_norm": 0.014460497535765171, "learning_rate": 1e-06, "loss": 0.0497, "step": 1554 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.31574487686157227, "epoch": 4.092105263157895, "grad_norm": 0.00562589755281806, "learning_rate": 1e-06, "loss": 0.0687, "step": 1555 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.30954110622406006, "epoch": 4.094736842105263, "grad_norm": 0.011875905096530914, "learning_rate": 1e-06, "loss": 0.0327, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3258.0, "completions/mean_length": 1236.6171875, "completions/mean_terminated_length": 588.7658081054688, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.317048043012619, "epoch": 4.097368421052631, "frac_reward_zero_std": 0.46875, "grad_norm": 72.20258331298828, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 520449900.0, "reward": 0.8271297216415405, "reward_std": 0.1219562292098999, "rewards/progression_diversity/mean": -0.016525140032172203, "rewards/progression_diversity/std": 0.0804808959364891, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9503580331802368, "rewards/symbolic_reward_partial_score/std": 0.1865689754486084, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0606681108474731, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.4164552688598633, "step": 1557 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3193093240261078, "epoch": 4.1, "grad_norm": 0.008888328447937965, "learning_rate": 1e-06, "loss": 0.0658, "step": 1558 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3196675330400467, "epoch": 4.102631578947369, "grad_norm": 0.007627990562468767, "learning_rate": 1e-06, "loss": 0.0829, "step": 1559 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.31077609956264496, "epoch": 4.105263157894737, "grad_norm": 0.011663636192679405, "learning_rate": 1e-06, "loss": 0.1007, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 8140.0, "completions/mean_length": 1069.8125, "completions/mean_terminated_length": 607.6136474609375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.34023210406303406, "epoch": 4.1078947368421055, "frac_reward_zero_std": 0.4375, "grad_norm": 0.020510073751211166, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 521379756.0, "reward": 0.8407922387123108, "reward_std": 0.14206728339195251, "rewards/progression_diversity/mean": -0.012575984001159668, "rewards/progression_diversity/std": 0.07108542323112488, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9606119394302368, "rewards/symbolic_reward_partial_score/std": 0.1664878875017166, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0578272342681885, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 350.0, "sampling/sampling_logp_difference/mean": 2.6809022426605225, "step": 1561 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.31028036773204803, "epoch": 4.110526315789474, "grad_norm": 0.0143095962703228, "learning_rate": 1e-06, "loss": 0.1249, "step": 1562 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.30853691697120667, "epoch": 4.113157894736842, "grad_norm": 0.05083174258470535, "learning_rate": 1e-06, "loss": 0.1291, "step": 1563 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.32772916555404663, "epoch": 4.11578947368421, "grad_norm": 0.020028606057167053, "learning_rate": 1e-06, "loss": 0.0466, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15382.0, "completions/mean_length": 1305.990234375, "completions/mean_terminated_length": 693.06298828125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.30608853697776794, "epoch": 4.118421052631579, "frac_reward_zero_std": 0.4375, "grad_norm": 67.87190246582031, "learning_rate": 1e-06, "loss": 0.0841, "num_tokens": 522466247.0, "reward": 0.8353767395019531, "reward_std": 0.13427817821502686, "rewards/progression_diversity/mean": -0.017013559117913246, "rewards/progression_diversity/std": 0.08219697326421738, "rewards/symbolic_reward_accuracy/mean": 0.921875, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.9505208134651184, "rewards/symbolic_reward_partial_score/std": 0.19484204053878784, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0560702085494995, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.4243669509887695, "step": 1565 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.29687653481960297, "epoch": 4.121052631578947, "grad_norm": 0.006236726883798838, "learning_rate": 1e-06, "loss": 0.1279, "step": 1566 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3095232844352722, "epoch": 4.123684210526315, "grad_norm": 0.002965538529679179, "learning_rate": 1e-06, "loss": 0.0331, "step": 1567 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.313509002327919, "epoch": 4.126315789473685, "grad_norm": 0.018090499565005302, "learning_rate": 1e-06, "loss": 0.0931, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3464.0, "completions/mean_length": 1349.125, "completions/mean_terminated_length": 577.314208984375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.2996673285961151, "epoch": 4.128947368421053, "frac_reward_zero_std": 0.46875, "grad_norm": 218.44454956054688, "learning_rate": 1e-06, "loss": 0.0891, "num_tokens": 523526119.0, "reward": 0.8193434476852417, "reward_std": 0.13413912057876587, "rewards/progression_diversity/mean": -0.018784930929541588, "rewards/progression_diversity/std": 0.0878358706831932, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.9388021230697632, "rewards/symbolic_reward_partial_score/std": 0.21514074504375458, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.054738998413086, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.5745482444763184, "step": 1569 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.31027545034885406, "epoch": 4.131578947368421, "grad_norm": 0.004378543235361576, "learning_rate": 1e-06, "loss": 0.0894, "step": 1570 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31161196529865265, "epoch": 4.13421052631579, "grad_norm": 0.005518985912203789, "learning_rate": 1e-06, "loss": 0.1057, "step": 1571 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.322713166475296, "epoch": 4.136842105263158, "grad_norm": 0.005724793300032616, "learning_rate": 1e-06, "loss": 0.023, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3315.0, "completions/mean_length": 1232.330078125, "completions/mean_terminated_length": 616.4085083007812, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.31801025569438934, "epoch": 4.139473684210526, "frac_reward_zero_std": 0.5, "grad_norm": 114.2596664428711, "learning_rate": 1e-06, "loss": 0.0698, "num_tokens": 524540528.0, "reward": 0.8093826770782471, "reward_std": 0.1396641731262207, "rewards/progression_diversity/mean": -0.013879730366170406, "rewards/progression_diversity/std": 0.07435765862464905, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9444987177848816, "rewards/symbolic_reward_partial_score/std": 0.1908387839794159, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.054489254951477, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.705132484436035, "step": 1573 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31481173634529114, "epoch": 4.1421052631578945, "grad_norm": 0.011268100701272488, "learning_rate": 1e-06, "loss": 0.0299, "step": 1574 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30692553520202637, "epoch": 4.144736842105263, "grad_norm": 0.013008923269808292, "learning_rate": 1e-06, "loss": 0.1172, "step": 1575 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.30435749888420105, "epoch": 4.147368421052631, "grad_norm": 0.016314802691340446, "learning_rate": 1e-06, "loss": 0.1019, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 1044.396484375, "completions/mean_terminated_length": 644.7675170898438, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.329788476228714, "epoch": 4.15, "frac_reward_zero_std": 0.46875, "grad_norm": 0.022590087726712227, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 525490075.0, "reward": 0.8446779847145081, "reward_std": 0.13736554980278015, "rewards/progression_diversity/mean": -0.009744609706103802, "rewards/progression_diversity/std": 0.06334702670574188, "rewards/symbolic_reward_accuracy/mean": 0.931640625, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.9611002206802368, "rewards/symbolic_reward_partial_score/std": 0.16516920924186707, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0595576763153076, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 1.8629345893859863, "step": 1577 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.310794472694397, "epoch": 4.152631578947369, "grad_norm": 0.0059421774931252, "learning_rate": 1e-06, "loss": 0.0536, "step": 1578 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.30790117383003235, "epoch": 4.155263157894737, "grad_norm": 1.5674898624420166, "learning_rate": 1e-06, "loss": 0.0615, "step": 1579 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31684084236621857, "epoch": 4.157894736842105, "grad_norm": 0.006289552431553602, "learning_rate": 1e-06, "loss": 0.049, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 1460.595703125, "completions/mean_terminated_length": 726.6577758789062, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.2980321794748306, "epoch": 4.160526315789474, "frac_reward_zero_std": 0.375, "grad_norm": 215.46676635742188, "learning_rate": 1e-06, "loss": 0.0781, "num_tokens": 526672588.0, "reward": 0.8091208934783936, "reward_std": 0.16659247875213623, "rewards/progression_diversity/mean": -0.015641260892152786, "rewards/progression_diversity/std": 0.07609428465366364, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9378255009651184, "rewards/symbolic_reward_partial_score/std": 0.20248618721961975, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.047804594039917, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 3.248504400253296, "step": 1581 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.28611473739147186, "epoch": 4.163157894736842, "grad_norm": 5.547037124633789, "learning_rate": 1e-06, "loss": 0.0783, "step": 1582 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3098125159740448, "epoch": 4.16578947368421, "grad_norm": 0.0198947936296463, "learning_rate": 1e-06, "loss": 0.0761, "step": 1583 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3102129250764847, "epoch": 4.168421052631579, "grad_norm": 0.020427729934453964, "learning_rate": 1e-06, "loss": 0.0666, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 1271.625, "completions/mean_terminated_length": 689.2008056640625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.3241703659296036, "epoch": 4.171052631578948, "frac_reward_zero_std": 0.5, "grad_norm": 19.84275245666504, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 527726124.0, "reward": 0.8260881900787354, "reward_std": 0.1229025349020958, "rewards/progression_diversity/mean": -0.013252872042357922, "rewards/progression_diversity/std": 0.07066137343645096, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.94482421875, "rewards/symbolic_reward_partial_score/std": 0.19333821535110474, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0540461540222168, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 2.846632480621338, "step": 1585 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.30269454419612885, "epoch": 4.173684210526316, "grad_norm": 1.6151353120803833, "learning_rate": 1e-06, "loss": 0.0914, "step": 1586 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.32799434661865234, "epoch": 4.176315789473684, "grad_norm": 0.03124081902205944, "learning_rate": 1e-06, "loss": 0.007, "step": 1587 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3005277067422867, "epoch": 4.178947368421053, "grad_norm": 0.007625295780599117, "learning_rate": 1e-06, "loss": 0.1777, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 1715.181640625, "completions/mean_terminated_length": 671.7928466796875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.30243825912475586, "epoch": 4.181578947368421, "frac_reward_zero_std": 0.28125, "grad_norm": 418.0889587402344, "learning_rate": 1e-06, "loss": 0.1181, "num_tokens": 528990761.0, "reward": 0.7599914073944092, "reward_std": 0.15093812346458435, "rewards/progression_diversity/mean": -0.021372389048337936, "rewards/progression_diversity/std": 0.0902642160654068, "rewards/symbolic_reward_accuracy/mean": 0.826171875, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.8992512822151184, "rewards/symbolic_reward_partial_score/std": 0.2561655044555664, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0439589023590088, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 348.0, "sampling/sampling_logp_difference/mean": 3.9997878074645996, "step": 1589 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.29131756722927094, "epoch": 4.184210526315789, "grad_norm": 0.009967650286853313, "learning_rate": 1e-06, "loss": 0.1418, "step": 1590 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.288121297955513, "epoch": 4.186842105263158, "grad_norm": 0.011154073290526867, "learning_rate": 1e-06, "loss": 0.1733, "step": 1591 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.32074061036109924, "epoch": 4.189473684210526, "grad_norm": 0.015442883595824242, "learning_rate": 1e-06, "loss": 0.057, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3200.0, "completions/mean_length": 1718.6875, "completions/mean_terminated_length": 741.0000610351562, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.3073541969060898, "epoch": 4.192105263157894, "frac_reward_zero_std": 0.5625, "grad_norm": 62.47317123413086, "learning_rate": 1e-06, "loss": 0.0488, "num_tokens": 530279049.0, "reward": 0.776131272315979, "reward_std": 0.11894486844539642, "rewards/progression_diversity/mean": -0.01871039718389511, "rewards/progression_diversity/std": 0.07976720482110977, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.9021810293197632, "rewards/symbolic_reward_partial_score/std": 0.25698360800743103, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0540015697479248, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 346.0, "sampling/sampling_logp_difference/mean": 2.4201743602752686, "step": 1593 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.29900647699832916, "epoch": 4.1947368421052635, "grad_norm": 103.9713363647461, "learning_rate": 1e-06, "loss": 0.0922, "step": 1594 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31466494500637054, "epoch": 4.197368421052632, "grad_norm": 0.0200185626745224, "learning_rate": 1e-06, "loss": 0.0658, "step": 1595 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.30266304314136505, "epoch": 4.2, "grad_norm": 0.011375721544027328, "learning_rate": 1e-06, "loss": 0.0875, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 1656.662109375, "completions/mean_terminated_length": 740.0228881835938, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.3035944253206253, "epoch": 4.2026315789473685, "frac_reward_zero_std": 0.3125, "grad_norm": 208.2892608642578, "learning_rate": 1e-06, "loss": 0.0753, "num_tokens": 531551420.0, "reward": 0.7731266021728516, "reward_std": 0.16756606101989746, "rewards/progression_diversity/mean": -0.021324899047613144, "rewards/progression_diversity/std": 0.08820579946041107, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.9150390625, "rewards/symbolic_reward_partial_score/std": 0.22819702327251434, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0414807796478271, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 350.0, "sampling/sampling_logp_difference/mean": 3.9151625633239746, "step": 1597 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3109179735183716, "epoch": 4.205263157894737, "grad_norm": 0.0169313196092844, "learning_rate": 1e-06, "loss": 0.1055, "step": 1598 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.31678929924964905, "epoch": 4.207894736842105, "grad_norm": 0.07289431989192963, "learning_rate": 1e-06, "loss": 0.0648, "step": 1599 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2846733182668686, "epoch": 4.2105263157894735, "grad_norm": 0.020705915987491608, "learning_rate": 1e-06, "loss": 0.1821, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4576.0, "completions/mean_length": 2429.921875, "completions/mean_terminated_length": 784.6812133789062, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.2895173579454422, "epoch": 4.213157894736842, "frac_reward_zero_std": 0.40625, "grad_norm": 806.2984619140625, "learning_rate": 1e-06, "loss": 0.1064, "num_tokens": 533202356.0, "reward": 0.7311378717422485, "reward_std": 0.14681148529052734, "rewards/progression_diversity/mean": -0.030749481171369553, "rewards/progression_diversity/std": 0.10168987512588501, "rewards/symbolic_reward_accuracy/mean": 0.794921875, "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, "rewards/symbolic_reward_partial_score/mean": 0.876953125, "rewards/symbolic_reward_partial_score/std": 0.2930140495300293, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0381789207458496, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 352.0, "sampling/sampling_logp_difference/mean": 4.134341239929199, "step": 1601 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.28859490156173706, "epoch": 4.215789473684211, "grad_norm": 0.0084293307736516, "learning_rate": 1e-06, "loss": 0.0704, "step": 1602 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2843547612428665, "epoch": 4.218421052631579, "grad_norm": 0.0123353386297822, "learning_rate": 1e-06, "loss": 0.1586, "step": 1603 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.27717554569244385, "epoch": 4.221052631578948, "grad_norm": 0.020575549453496933, "learning_rate": 1e-06, "loss": 0.1076, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3167.0, "completions/mean_length": 1855.916015625, "completions/mean_terminated_length": 757.1533813476562, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.2824231684207916, "epoch": 4.223684210526316, "frac_reward_zero_std": 0.34375, "grad_norm": 216.76248168945312, "learning_rate": 1e-06, "loss": 0.1544, "num_tokens": 534562409.0, "reward": 0.7700595259666443, "reward_std": 0.1850237250328064, "rewards/progression_diversity/mean": -0.025300273671746254, "rewards/progression_diversity/std": 0.09943580627441406, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.908203125, "rewards/symbolic_reward_partial_score/std": 0.24698437750339508, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0366488695144653, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 356.0, "sampling/sampling_logp_difference/mean": 4.752798080444336, "step": 1605 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.30687089264392853, "epoch": 4.226315789473684, "grad_norm": 0.03631648048758507, "learning_rate": 1e-06, "loss": 0.0767, "step": 1606 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2971319705247879, "epoch": 4.228947368421053, "grad_norm": 0.016691574826836586, "learning_rate": 1e-06, "loss": 0.1243, "step": 1607 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.29933421313762665, "epoch": 4.231578947368421, "grad_norm": 0.024404583498835564, "learning_rate": 1e-06, "loss": 0.0898, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 1166.900390625, "completions/mean_terminated_length": 644.29296875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.3166111558675766, "epoch": 4.234210526315789, "frac_reward_zero_std": 0.53125, "grad_norm": 546.3336791992188, "learning_rate": 1e-06, "loss": 0.0718, "num_tokens": 535559318.0, "reward": 0.8583120107650757, "reward_std": 0.08758790791034698, "rewards/progression_diversity/mean": -0.013530386611819267, "rewards/progression_diversity/std": 0.0759081244468689, "rewards/symbolic_reward_accuracy/mean": 0.94921875, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.9734700918197632, "rewards/symbolic_reward_partial_score/std": 0.13808730244636536, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.050557017326355, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 356.0, "sampling/sampling_logp_difference/mean": 3.118114471435547, "step": 1609 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31313398480415344, "epoch": 4.2368421052631575, "grad_norm": 0.01867660880088806, "learning_rate": 1e-06, "loss": 0.1433, "step": 1610 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31812839210033417, "epoch": 4.239473684210527, "grad_norm": 0.00977341365069151, "learning_rate": 1e-06, "loss": 0.0707, "step": 1611 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.32100366055965424, "epoch": 4.242105263157895, "grad_norm": 0.006353206932544708, "learning_rate": 1e-06, "loss": 0.0522, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1447.01953125, "completions/mean_terminated_length": 712.4138793945312, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.286122590303421, "epoch": 4.244736842105263, "frac_reward_zero_std": 0.3125, "grad_norm": 307.3343200683594, "learning_rate": 1e-06, "loss": 0.1766, "num_tokens": 536710016.0, "reward": 0.7893308401107788, "reward_std": 0.1426737904548645, "rewards/progression_diversity/mean": -0.017116881906986237, "rewards/progression_diversity/std": 0.0786111056804657, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.9259439706802368, "rewards/symbolic_reward_partial_score/std": 0.20985758304595947, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0420353412628174, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 358.0, "sampling/sampling_logp_difference/mean": 4.076284408569336, "step": 1613 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.29413002729415894, "epoch": 4.247368421052632, "grad_norm": 0.011863662861287594, "learning_rate": 1e-06, "loss": 0.1084, "step": 1614 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.32583458721637726, "epoch": 4.25, "grad_norm": 0.009227105416357517, "learning_rate": 1e-06, "loss": 0.0131, "step": 1615 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.30109357833862305, "epoch": 4.252631578947368, "grad_norm": 0.08455037325620651, "learning_rate": 1e-06, "loss": 0.0389, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3129.0, "completions/mean_length": 1303.62109375, "completions/mean_terminated_length": 690.5975341796875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.3038036525249481, "epoch": 4.255263157894737, "frac_reward_zero_std": 0.46875, "grad_norm": 244.20498657226562, "learning_rate": 1e-06, "loss": 0.1133, "num_tokens": 537764254.0, "reward": 0.804751992225647, "reward_std": 0.1407267451286316, "rewards/progression_diversity/mean": -0.01308908686041832, "rewards/progression_diversity/std": 0.06904362142086029, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.9322916269302368, "rewards/symbolic_reward_partial_score/std": 0.21590134501457214, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049241065979004, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 360.0, "sampling/sampling_logp_difference/mean": 3.09298038482666, "step": 1617 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31994016468524933, "epoch": 4.257894736842105, "grad_norm": 0.010734348557889462, "learning_rate": 1e-06, "loss": 0.0265, "step": 1618 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.30486488342285156, "epoch": 4.260526315789473, "grad_norm": 0.010809490457177162, "learning_rate": 1e-06, "loss": 0.0587, "step": 1619 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2901280075311661, "epoch": 4.2631578947368425, "grad_norm": 0.03101206384599209, "learning_rate": 1e-06, "loss": 0.086, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3629.0, "completions/mean_length": 1185.048828125, "completions/mean_terminated_length": 663.064697265625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.3062327206134796, "epoch": 4.265789473684211, "frac_reward_zero_std": 0.34375, "grad_norm": 202.47483825683594, "learning_rate": 1e-06, "loss": 0.0557, "num_tokens": 538737751.0, "reward": 0.8110127449035645, "reward_std": 0.16020728647708893, "rewards/progression_diversity/mean": -0.012012619525194168, "rewards/progression_diversity/std": 0.06591516733169556, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9381510019302368, "rewards/symbolic_reward_partial_score/std": 0.19927191734313965, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049810528755188, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 360.0, "sampling/sampling_logp_difference/mean": 3.169362783432007, "step": 1621 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2987634390592575, "epoch": 4.268421052631579, "grad_norm": 141.50709533691406, "learning_rate": 1e-06, "loss": 0.1229, "step": 1622 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3115040361881256, "epoch": 4.271052631578947, "grad_norm": 0.011638534255325794, "learning_rate": 1e-06, "loss": 0.0587, "step": 1623 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.31850266456604004, "epoch": 4.273684210526316, "grad_norm": 0.00664818100631237, "learning_rate": 1e-06, "loss": 0.0181, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 1301.916015625, "completions/mean_terminated_length": 752.3663940429688, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.3117539584636688, "epoch": 4.276315789473684, "frac_reward_zero_std": 0.34375, "grad_norm": 244.9304962158203, "learning_rate": 1e-06, "loss": 0.0613, "num_tokens": 539834028.0, "reward": 0.8020786046981812, "reward_std": 0.16100260615348816, "rewards/progression_diversity/mean": -0.011866888031363487, "rewards/progression_diversity/std": 0.06344626098871231, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.93896484375, "rewards/symbolic_reward_partial_score/std": 0.1891382783651352, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0457390546798706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 364.0, "sampling/sampling_logp_difference/mean": 3.5810694694519043, "step": 1625 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2959267497062683, "epoch": 4.278947368421052, "grad_norm": 0.020144561305642128, "learning_rate": 1e-06, "loss": 0.1104, "step": 1626 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3018384575843811, "epoch": 4.281578947368421, "grad_norm": 0.014604386873543262, "learning_rate": 1e-06, "loss": 0.0735, "step": 1627 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.30800075829029083, "epoch": 4.284210526315789, "grad_norm": 0.014517586678266525, "learning_rate": 1e-06, "loss": 0.0251, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3842.0, "completions/mean_length": 956.640625, "completions/mean_terminated_length": 711.761962890625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.3077455312013626, "epoch": 4.286842105263158, "frac_reward_zero_std": 0.5625, "grad_norm": 398.7921447753906, "learning_rate": 1e-06, "loss": 0.0327, "num_tokens": 540715284.0, "reward": 0.847118616104126, "reward_std": 0.09315143525600433, "rewards/progression_diversity/mean": -0.004935206845402718, "rewards/progression_diversity/std": 0.043800655752420425, "rewards/symbolic_reward_accuracy/mean": 0.9296875, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.9690755605697632, "rewards/symbolic_reward_partial_score/std": 0.13879333436489105, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059929609298706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 364.0, "sampling/sampling_logp_difference/mean": 1.7687201499938965, "step": 1629 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3137192130088806, "epoch": 4.2894736842105265, "grad_norm": 0.01701788231730461, "learning_rate": 1e-06, "loss": 0.0515, "step": 1630 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.32366204261779785, "epoch": 4.292105263157895, "grad_norm": 0.03292662650346756, "learning_rate": 1e-06, "loss": 0.0018, "step": 1631 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3094526529312134, "epoch": 4.294736842105263, "grad_norm": 0.02371911332011223, "learning_rate": 1e-06, "loss": 0.064, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4493.0, "completions/mean_length": 1390.392578125, "completions/mean_terminated_length": 717.2101440429688, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.30759990215301514, "epoch": 4.2973684210526315, "frac_reward_zero_std": 0.59375, "grad_norm": 396.1076965332031, "learning_rate": 1e-06, "loss": 0.082, "num_tokens": 541829469.0, "reward": 0.7947248220443726, "reward_std": 0.0687393993139267, "rewards/progression_diversity/mean": -0.009939271956682205, "rewards/progression_diversity/std": 0.055136967450380325, "rewards/symbolic_reward_accuracy/mean": 0.8671875, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.9267578125, "rewards/symbolic_reward_partial_score/std": 0.2075078934431076, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.054058313369751, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 364.0, "sampling/sampling_logp_difference/mean": 2.6795334815979004, "step": 1633 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3071286678314209, "epoch": 4.3, "grad_norm": 0.010865924879908562, "learning_rate": 1e-06, "loss": 0.0643, "step": 1634 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.31138360500335693, "epoch": 4.302631578947368, "grad_norm": 0.012006482109427452, "learning_rate": 1e-06, "loss": 0.0281, "step": 1635 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3028770089149475, "epoch": 4.3052631578947365, "grad_norm": 0.30845019221305847, "learning_rate": 1e-06, "loss": 0.0632, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 868.23828125, "completions/mean_terminated_length": 590.6202392578125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.3321658670902252, "epoch": 4.307894736842106, "frac_reward_zero_std": 0.53125, "grad_norm": 0.008871855214238167, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 542626583.0, "reward": 0.8462392091751099, "reward_std": 0.11010201275348663, "rewards/progression_diversity/mean": -0.0049893660470843315, "rewards/progression_diversity/std": 0.04094173386693001, "rewards/symbolic_reward_accuracy/mean": 0.927734375, "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, "rewards/symbolic_reward_partial_score/mean": 0.970703125, "rewards/symbolic_reward_partial_score/std": 0.12699371576309204, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0612621307373047, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 366.0, "sampling/sampling_logp_difference/mean": 2.047287940979004, "step": 1637 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3181557357311249, "epoch": 4.310526315789474, "grad_norm": 0.023050582036376, "learning_rate": 1e-06, "loss": 0.0581, "step": 1638 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3208106458187103, "epoch": 4.313157894736842, "grad_norm": 0.01196917425841093, "learning_rate": 1e-06, "loss": 0.0703, "step": 1639 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3326588422060013, "epoch": 4.315789473684211, "grad_norm": 0.011661848984658718, "learning_rate": 1e-06, "loss": -0.0092, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 1302.34765625, "completions/mean_terminated_length": 752.8137817382812, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.2890154719352722, "epoch": 4.318421052631579, "frac_reward_zero_std": 0.40625, "grad_norm": 436.7637939453125, "learning_rate": 1e-06, "loss": 0.1363, "num_tokens": 543714537.0, "reward": 0.8115625977516174, "reward_std": 0.11080306768417358, "rewards/progression_diversity/mean": -0.010732462629675865, "rewards/progression_diversity/std": 0.062084443867206573, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9412435293197632, "rewards/symbolic_reward_partial_score/std": 0.18950168788433075, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.048242211341858, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 366.0, "sampling/sampling_logp_difference/mean": 3.6092000007629395, "step": 1641 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2962608188390732, "epoch": 4.321052631578947, "grad_norm": 0.027114521712064743, "learning_rate": 1e-06, "loss": 0.0896, "step": 1642 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3004533350467682, "epoch": 4.323684210526316, "grad_norm": 0.021577974781394005, "learning_rate": 1e-06, "loss": 0.0285, "step": 1643 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.30283403396606445, "epoch": 4.326315789473684, "grad_norm": 0.01051260158419609, "learning_rate": 1e-06, "loss": 0.0431, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 1617.533203125, "completions/mean_terminated_length": 730.9337768554688, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.28192102909088135, "epoch": 4.328947368421053, "frac_reward_zero_std": 0.40625, "grad_norm": 224.36424255371094, "learning_rate": 1e-06, "loss": 0.0977, "num_tokens": 544947706.0, "reward": 0.7883968949317932, "reward_std": 0.1514188051223755, "rewards/progression_diversity/mean": -0.017739124596118927, "rewards/progression_diversity/std": 0.07712989300489426, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.9241536259651184, "rewards/symbolic_reward_partial_score/std": 0.2255030870437622, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0469114780426025, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 3.760016441345215, "step": 1645 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.30050475895404816, "epoch": 4.331578947368421, "grad_norm": 0.010516730137169361, "learning_rate": 1e-06, "loss": 0.0643, "step": 1646 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.29950810968875885, "epoch": 4.33421052631579, "grad_norm": 4.544126510620117, "learning_rate": 1e-06, "loss": 0.0271, "step": 1647 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.29230254888534546, "epoch": 4.336842105263158, "grad_norm": 4.5623393058776855, "learning_rate": 1e-06, "loss": 0.0658, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3712.0, "completions/mean_length": 1151.015625, "completions/mean_terminated_length": 659.6290283203125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.31469690799713135, "epoch": 4.339473684210526, "frac_reward_zero_std": 0.53125, "grad_norm": 301.08685302734375, "learning_rate": 1e-06, "loss": 0.0391, "num_tokens": 545939170.0, "reward": 0.8617491722106934, "reward_std": 0.09493400156497955, "rewards/progression_diversity/mean": -0.011606581509113312, "rewards/progression_diversity/std": 0.06665636599063873, "rewards/symbolic_reward_accuracy/mean": 0.94921875, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.9777017831802368, "rewards/symbolic_reward_partial_score/std": 0.12183935195207596, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0512254238128662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 366.0, "sampling/sampling_logp_difference/mean": 3.1432154178619385, "step": 1649 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.30177412927150726, "epoch": 4.342105263157895, "grad_norm": 318.8957214355469, "learning_rate": 1e-06, "loss": 0.0878, "step": 1650 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2994520217180252, "epoch": 4.344736842105263, "grad_norm": 0.24904422461986542, "learning_rate": 1e-06, "loss": 0.0613, "step": 1651 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3131738752126694, "epoch": 4.347368421052631, "grad_norm": 0.016566958278417587, "learning_rate": 1e-06, "loss": 0.0589, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 1417.859375, "completions/mean_terminated_length": 649.5770263671875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.2909892499446869, "epoch": 4.35, "frac_reward_zero_std": 0.5, "grad_norm": 281.6248474121094, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 547068762.0, "reward": 0.8414824604988098, "reward_std": 0.0903344452381134, "rewards/progression_diversity/mean": -0.016799690201878548, "rewards/progression_diversity/std": 0.07866127043962479, "rewards/symbolic_reward_accuracy/mean": 0.921875, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.9656575322151184, "rewards/symbolic_reward_partial_score/std": 0.1427777260541916, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0441806316375732, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 4.0731892585754395, "step": 1653 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.29775403439998627, "epoch": 4.352631578947369, "grad_norm": 1.3435653448104858, "learning_rate": 1e-06, "loss": 0.0371, "step": 1654 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.27768969535827637, "epoch": 4.355263157894737, "grad_norm": 0.009355852380394936, "learning_rate": 1e-06, "loss": 0.1486, "step": 1655 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3120851516723633, "epoch": 4.3578947368421055, "grad_norm": 0.5682896375656128, "learning_rate": 1e-06, "loss": 0.0239, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3438.0, "completions/mean_length": 1578.765625, "completions/mean_terminated_length": 722.264404296875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.2924637943506241, "epoch": 4.360526315789474, "frac_reward_zero_std": 0.375, "grad_norm": 165.94813537597656, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 548296738.0, "reward": 0.8284804821014404, "reward_std": 0.11700120568275452, "rewards/progression_diversity/mean": -0.018165672197937965, "rewards/progression_diversity/std": 0.07916979491710663, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9549153447151184, "rewards/symbolic_reward_partial_score/std": 0.1701822131872177, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041003704071045, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 366.0, "sampling/sampling_logp_difference/mean": 4.021053314208984, "step": 1657 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2971417158842087, "epoch": 4.363157894736842, "grad_norm": 0.03589131683111191, "learning_rate": 1e-06, "loss": 0.082, "step": 1658 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.28301428258419037, "epoch": 4.36578947368421, "grad_norm": 0.021585499867796898, "learning_rate": 1e-06, "loss": 0.1136, "step": 1659 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.28591735661029816, "epoch": 4.368421052631579, "grad_norm": 0.006448809057474136, "learning_rate": 1e-06, "loss": 0.131, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 1222.650390625, "completions/mean_terminated_length": 701.9575805664062, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.29014408588409424, "epoch": 4.371052631578947, "frac_reward_zero_std": 0.4375, "grad_norm": 705.5008544921875, "learning_rate": 1e-06, "loss": 0.0896, "num_tokens": 549336047.0, "reward": 0.8229233026504517, "reward_std": 0.11057721078395844, "rewards/progression_diversity/mean": -0.012355408631265163, "rewards/progression_diversity/std": 0.06910999119281769, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.9622395634651184, "rewards/symbolic_reward_partial_score/std": 0.1456439197063446, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0430617332458496, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 366.0, "sampling/sampling_logp_difference/mean": 3.94387149810791, "step": 1661 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2902925908565521, "epoch": 4.373684210526315, "grad_norm": 184.36441040039062, "learning_rate": 1e-06, "loss": 0.1117, "step": 1662 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2993343472480774, "epoch": 4.376315789473685, "grad_norm": 0.006076624616980553, "learning_rate": 1e-06, "loss": 0.0502, "step": 1663 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.30178727209568024, "epoch": 4.378947368421053, "grad_norm": 0.00712067075073719, "learning_rate": 1e-06, "loss": 0.0184, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3090.0, "completions/mean_length": 1699.12109375, "completions/mean_terminated_length": 752.6943969726562, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.2760689854621887, "epoch": 4.381578947368421, "frac_reward_zero_std": 0.34375, "grad_norm": 577.8743896484375, "learning_rate": 1e-06, "loss": 0.0909, "num_tokens": 550629997.0, "reward": 0.8087052702903748, "reward_std": 0.1341739296913147, "rewards/progression_diversity/mean": -0.018151385709643364, "rewards/progression_diversity/std": 0.07787839323282242, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.9462890625, "rewards/symbolic_reward_partial_score/std": 0.18004465103149414, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0394903421401978, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 364.0, "sampling/sampling_logp_difference/mean": 4.506129741668701, "step": 1665 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.26462263613939285, "epoch": 4.38421052631579, "grad_norm": 1588.7640380859375, "learning_rate": 1e-06, "loss": 0.3391, "step": 1666 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.29205040633678436, "epoch": 4.386842105263158, "grad_norm": 0.009880495257675648, "learning_rate": 1e-06, "loss": 0.0564, "step": 1667 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.28884167969226837, "epoch": 4.389473684210526, "grad_norm": 0.010468707419931889, "learning_rate": 1e-06, "loss": 0.0718, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3253.0, "completions/mean_length": 1559.23828125, "completions/mean_terminated_length": 733.9423217773438, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.26679860055446625, "epoch": 4.3921052631578945, "frac_reward_zero_std": 0.4375, "grad_norm": 465.1615295410156, "learning_rate": 1e-06, "loss": 0.1201, "num_tokens": 551842663.0, "reward": 0.7915906310081482, "reward_std": 0.1362362802028656, "rewards/progression_diversity/mean": -0.015742970630526543, "rewards/progression_diversity/std": 0.07118507474660873, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.9275715947151184, "rewards/symbolic_reward_partial_score/std": 0.2216237634420395, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0484164953231812, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 366.0, "sampling/sampling_logp_difference/mean": 3.136202096939087, "step": 1669 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2917633354663849, "epoch": 4.394736842105263, "grad_norm": 0.009237710386514664, "learning_rate": 1e-06, "loss": 0.0597, "step": 1670 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.29464130103588104, "epoch": 4.397368421052631, "grad_norm": 0.020612401887774467, "learning_rate": 1e-06, "loss": 0.0545, "step": 1671 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.29814429581165314, "epoch": 4.4, "grad_norm": 0.023844098672270775, "learning_rate": 1e-06, "loss": 0.0525, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3713.0, "completions/mean_length": 1361.876953125, "completions/mean_terminated_length": 655.31494140625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.29356473684310913, "epoch": 4.402631578947369, "frac_reward_zero_std": 0.375, "grad_norm": 48.91466522216797, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 552934888.0, "reward": 0.8409208655357361, "reward_std": 0.11717473715543747, "rewards/progression_diversity/mean": -0.01435843575745821, "rewards/progression_diversity/std": 0.06984733790159225, "rewards/symbolic_reward_accuracy/mean": 0.927734375, "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, "rewards/symbolic_reward_partial_score/mean": 0.95849609375, "rewards/symbolic_reward_partial_score/std": 0.1762569397687912, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0481362342834473, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 366.0, "sampling/sampling_logp_difference/mean": 3.0723586082458496, "step": 1673 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.28971320390701294, "epoch": 4.405263157894737, "grad_norm": 17.948726654052734, "learning_rate": 1e-06, "loss": 0.1098, "step": 1674 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.29446932673454285, "epoch": 4.407894736842105, "grad_norm": 0.010464577004313469, "learning_rate": 1e-06, "loss": 0.0555, "step": 1675 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.28618983924388885, "epoch": 4.410526315789474, "grad_norm": 0.010811169631779194, "learning_rate": 1e-06, "loss": 0.1078, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 1110.826171875, "completions/mean_terminated_length": 649.8651733398438, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.29554495215415955, "epoch": 4.413157894736842, "frac_reward_zero_std": 0.3125, "grad_norm": 467.7513427734375, "learning_rate": 1e-06, "loss": 0.1092, "num_tokens": 553879247.0, "reward": 0.8643869161605835, "reward_std": 0.11433817446231842, "rewards/progression_diversity/mean": -0.011511346325278282, "rewards/progression_diversity/std": 0.06805232167243958, "rewards/symbolic_reward_accuracy/mean": 0.953125, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.9793294668197632, "rewards/symbolic_reward_partial_score/std": 0.1068115308880806, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0465220212936401, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 3.3040924072265625, "step": 1677 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2996697723865509, "epoch": 4.41578947368421, "grad_norm": 0.005097354296594858, "learning_rate": 1e-06, "loss": 0.0766, "step": 1678 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.30967849493026733, "epoch": 4.418421052631579, "grad_norm": 0.008583194576203823, "learning_rate": 1e-06, "loss": 0.0206, "step": 1679 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2984393239021301, "epoch": 4.421052631578947, "grad_norm": 0.020960450172424316, "learning_rate": 1e-06, "loss": 0.0607, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 1718.21875, "completions/mean_terminated_length": 805.4108276367188, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.2852473855018616, "epoch": 4.423684210526316, "frac_reward_zero_std": 0.46875, "grad_norm": 566.0900268554688, "learning_rate": 1e-06, "loss": 0.1011, "num_tokens": 555183295.0, "reward": 0.8038582801818848, "reward_std": 0.10909964144229889, "rewards/progression_diversity/mean": -0.019448932260274887, "rewards/progression_diversity/std": 0.08251741528511047, "rewards/symbolic_reward_accuracy/mean": 0.87890625, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.93798828125, "rewards/symbolic_reward_partial_score/std": 0.2074093222618103, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0440826416015625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 3.4236390590667725, "step": 1681 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2848607301712036, "epoch": 4.426315789473684, "grad_norm": 0.014388111419975758, "learning_rate": 1e-06, "loss": 0.0661, "step": 1682 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.279374361038208, "epoch": 4.428947368421053, "grad_norm": 0.014383941888809204, "learning_rate": 1e-06, "loss": 0.0695, "step": 1683 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2949568033218384, "epoch": 4.431578947368421, "grad_norm": 0.015598650090396404, "learning_rate": 1e-06, "loss": 0.0831, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15587.0, "completions/mean_length": 1525.705078125, "completions/mean_terminated_length": 762.958984375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.300376757979393, "epoch": 4.434210526315789, "frac_reward_zero_std": 0.3125, "grad_norm": 411.582275390625, "learning_rate": 1e-06, "loss": 0.0576, "num_tokens": 556383464.0, "reward": 0.8135613203048706, "reward_std": 0.15236347913742065, "rewards/progression_diversity/mean": -0.015942061319947243, "rewards/progression_diversity/std": 0.07582908123731613, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.9415689706802368, "rewards/symbolic_reward_partial_score/std": 0.20094631612300873, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0429538488388062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 3.6773757934570312, "step": 1685 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2941613495349884, "epoch": 4.436842105263158, "grad_norm": 0.00848439708352089, "learning_rate": 1e-06, "loss": 0.0689, "step": 1686 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2935366630554199, "epoch": 4.439473684210526, "grad_norm": 6.972188949584961, "learning_rate": 1e-06, "loss": 0.0504, "step": 1687 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2752288281917572, "epoch": 4.442105263157894, "grad_norm": 0.005782074760645628, "learning_rate": 1e-06, "loss": 0.1494, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 1201.7578125, "completions/mean_terminated_length": 743.5411987304688, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.29591110348701477, "epoch": 4.4447368421052635, "frac_reward_zero_std": 0.375, "grad_norm": 890.4512939453125, "learning_rate": 1e-06, "loss": 0.0993, "num_tokens": 557391724.0, "reward": 0.8423638343811035, "reward_std": 0.1323879361152649, "rewards/progression_diversity/mean": -0.011668046936392784, "rewards/progression_diversity/std": 0.06880556792020798, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9638671875, "rewards/symbolic_reward_partial_score/std": 0.1620253175497055, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0433096885681152, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 3.715100049972534, "step": 1689 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3018334209918976, "epoch": 4.447368421052632, "grad_norm": 0.013348652981221676, "learning_rate": 1e-06, "loss": 0.0512, "step": 1690 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2954847514629364, "epoch": 4.45, "grad_norm": 0.00606268085539341, "learning_rate": 1e-06, "loss": 0.0952, "step": 1691 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.30556538701057434, "epoch": 4.4526315789473685, "grad_norm": 0.008453385904431343, "learning_rate": 1e-06, "loss": 0.0245, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3241.0, "completions/mean_length": 1361.220703125, "completions/mean_terminated_length": 718.6986083984375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.2938489019870758, "epoch": 4.455263157894737, "frac_reward_zero_std": 0.4375, "grad_norm": 246.77127075195312, "learning_rate": 1e-06, "loss": 0.0901, "num_tokens": 558478461.0, "reward": 0.8210785984992981, "reward_std": 0.12946206331253052, "rewards/progression_diversity/mean": -0.011284667067229748, "rewards/progression_diversity/std": 0.06149439141154289, "rewards/symbolic_reward_accuracy/mean": 0.8984375, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.9495442509651184, "rewards/symbolic_reward_partial_score/std": 0.181060791015625, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.046877145767212, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 3.4112725257873535, "step": 1693 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3013473302125931, "epoch": 4.457894736842105, "grad_norm": 0.0174638070166111, "learning_rate": 1e-06, "loss": 0.0643, "step": 1694 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2852264940738678, "epoch": 4.4605263157894735, "grad_norm": 0.011137026362121105, "learning_rate": 1e-06, "loss": 0.1265, "step": 1695 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.30904076993465424, "epoch": 4.463157894736842, "grad_norm": 0.007532245479524136, "learning_rate": 1e-06, "loss": 0.0096, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3318.0, "completions/mean_length": 1733.322265625, "completions/mean_terminated_length": 853.6749877929688, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.275277316570282, "epoch": 4.465789473684211, "frac_reward_zero_std": 0.5, "grad_norm": 413.572509765625, "learning_rate": 1e-06, "loss": 0.1591, "num_tokens": 559778914.0, "reward": 0.8111288547515869, "reward_std": 0.0881713479757309, "rewards/progression_diversity/mean": -0.015045925043523312, "rewards/progression_diversity/std": 0.06671184301376343, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.93212890625, "rewards/symbolic_reward_partial_score/std": 0.21600748598575592, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0425660610198975, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 3.744896650314331, "step": 1697 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2827970087528229, "epoch": 4.468421052631579, "grad_norm": 0.008820455521345139, "learning_rate": 1e-06, "loss": 0.0781, "step": 1698 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.27910783886909485, "epoch": 4.471052631578948, "grad_norm": 0.009887280873954296, "learning_rate": 1e-06, "loss": 0.0669, "step": 1699 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2814963161945343, "epoch": 4.473684210526316, "grad_norm": 0.011021401733160019, "learning_rate": 1e-06, "loss": 0.083, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 1051.0390625, "completions/mean_terminated_length": 745.6016235351562, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.29189206659793854, "epoch": 4.476315789473684, "frac_reward_zero_std": 0.53125, "grad_norm": 435.6475830078125, "learning_rate": 1e-06, "loss": 0.0678, "num_tokens": 560717846.0, "reward": 0.8404079675674438, "reward_std": 0.11957277357578278, "rewards/progression_diversity/mean": -0.007058565504848957, "rewards/progression_diversity/std": 0.05117020010948181, "rewards/symbolic_reward_accuracy/mean": 0.919921875, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.9676106572151184, "rewards/symbolic_reward_partial_score/std": 0.13442426919937134, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0530710220336914, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 2.474128484725952, "step": 1701 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2915034145116806, "epoch": 4.478947368421053, "grad_norm": 0.009802755899727345, "learning_rate": 1e-06, "loss": 0.0604, "step": 1702 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.30035799741744995, "epoch": 4.481578947368421, "grad_norm": 0.007560526020824909, "learning_rate": 1e-06, "loss": 0.0481, "step": 1703 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3007005453109741, "epoch": 4.484210526315789, "grad_norm": 0.007429624907672405, "learning_rate": 1e-06, "loss": 0.0331, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1491.83984375, "completions/mean_terminated_length": 791.3905639648438, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.2913738787174225, "epoch": 4.4868421052631575, "frac_reward_zero_std": 0.34375, "grad_norm": 304.2225341796875, "learning_rate": 1e-06, "loss": 0.0793, "num_tokens": 561894820.0, "reward": 0.8073344230651855, "reward_std": 0.1524999737739563, "rewards/progression_diversity/mean": -0.013631231151521206, "rewards/progression_diversity/std": 0.06903047114610672, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.9435221552848816, "rewards/symbolic_reward_partial_score/std": 0.19161830842494965, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0431602001190186, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 368.0, "sampling/sampling_logp_difference/mean": 3.739988088607788, "step": 1705 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2814719080924988, "epoch": 4.489473684210527, "grad_norm": 0.014499134384095669, "learning_rate": 1e-06, "loss": 0.1048, "step": 1706 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.294638067483902, "epoch": 4.492105263157895, "grad_norm": 0.01003054529428482, "learning_rate": 1e-06, "loss": 0.0582, "step": 1707 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2986578643321991, "epoch": 4.494736842105263, "grad_norm": 0.014435573481023312, "learning_rate": 1e-06, "loss": 0.0615, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 1135.01953125, "completions/mean_terminated_length": 769.0440063476562, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.3132117986679077, "epoch": 4.497368421052632, "frac_reward_zero_std": 0.59375, "grad_norm": 0.014912238344550133, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 562862734.0, "reward": 0.832605242729187, "reward_std": 0.09201730787754059, "rewards/progression_diversity/mean": -0.006079651415348053, "rewards/progression_diversity/std": 0.044652536511421204, "rewards/symbolic_reward_accuracy/mean": 0.912109375, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.95849609375, "rewards/symbolic_reward_partial_score/std": 0.15315403044223785, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.052751064300537, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 370.0, "sampling/sampling_logp_difference/mean": 2.5634231567382812, "step": 1709 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3069326877593994, "epoch": 4.5, "grad_norm": 0.00920610036700964, "learning_rate": 1e-06, "loss": 0.0197, "step": 1710 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.28757910430431366, "epoch": 4.502631578947368, "grad_norm": 0.006706486456096172, "learning_rate": 1e-06, "loss": 0.1082, "step": 1711 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.29572418332099915, "epoch": 4.505263157894737, "grad_norm": 0.008948331698775291, "learning_rate": 1e-06, "loss": 0.0682, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3745.0, "completions/mean_length": 1313.30078125, "completions/mean_terminated_length": 795.7212524414062, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.29290150105953217, "epoch": 4.507894736842105, "frac_reward_zero_std": 0.59375, "grad_norm": 150.0733184814453, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 563937928.0, "reward": 0.8227621912956238, "reward_std": 0.09591831266880035, "rewards/progression_diversity/mean": -0.008939700201153755, "rewards/progression_diversity/std": 0.055079780519008636, "rewards/symbolic_reward_accuracy/mean": 0.896484375, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.958984375, "rewards/symbolic_reward_partial_score/std": 0.14947035908699036, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0529078245162964, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 370.0, "sampling/sampling_logp_difference/mean": 2.372925281524658, "step": 1713 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3026455044746399, "epoch": 4.510526315789473, "grad_norm": 0.013394175097346306, "learning_rate": 1e-06, "loss": 0.0088, "step": 1714 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.29875096678733826, "epoch": 4.5131578947368425, "grad_norm": 0.007891521789133549, "learning_rate": 1e-06, "loss": 0.0854, "step": 1715 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.29000018537044525, "epoch": 4.515789473684211, "grad_norm": 0.010807064361870289, "learning_rate": 1e-06, "loss": 0.0695, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3374.0, "completions/mean_length": 1286.517578125, "completions/mean_terminated_length": 830.859130859375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.3059823364019394, "epoch": 4.518421052631579, "frac_reward_zero_std": 0.65625, "grad_norm": 41.614864349365234, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 564988241.0, "reward": 0.8157514333724976, "reward_std": 0.08176864683628082, "rewards/progression_diversity/mean": -0.006893161218613386, "rewards/progression_diversity/std": 0.0425245501101017, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.943359375, "rewards/symbolic_reward_partial_score/std": 0.18591801822185516, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0574617385864258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 370.0, "sampling/sampling_logp_difference/mean": 1.9077733755111694, "step": 1717 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2892063856124878, "epoch": 4.521052631578947, "grad_norm": 0.005814536940306425, "learning_rate": 1e-06, "loss": 0.0526, "step": 1718 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29770365357398987, "epoch": 4.523684210526316, "grad_norm": 0.021579096093773842, "learning_rate": 1e-06, "loss": 0.0965, "step": 1719 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.30083292722702026, "epoch": 4.526315789473684, "grad_norm": 0.016088807955384254, "learning_rate": 1e-06, "loss": 0.0295, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3407.0, "completions/mean_length": 1413.296875, "completions/mean_terminated_length": 899.1515502929688, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.29098886251449585, "epoch": 4.528947368421052, "frac_reward_zero_std": 0.5625, "grad_norm": 66.23799896240234, "learning_rate": 1e-06, "loss": 0.0473, "num_tokens": 566137193.0, "reward": 0.8391268253326416, "reward_std": 0.11196374893188477, "rewards/progression_diversity/mean": -0.008220801129937172, "rewards/progression_diversity/std": 0.05032823607325554, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9562174081802368, "rewards/symbolic_reward_partial_score/std": 0.18327513337135315, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0517854690551758, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 372.0, "sampling/sampling_logp_difference/mean": 2.562303304672241, "step": 1721 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2914315462112427, "epoch": 4.531578947368421, "grad_norm": 7.667601585388184, "learning_rate": 1e-06, "loss": 0.043, "step": 1722 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2990102767944336, "epoch": 4.534210526315789, "grad_norm": 0.007618363946676254, "learning_rate": 1e-06, "loss": 0.0445, "step": 1723 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.29170943796634674, "epoch": 4.536842105263158, "grad_norm": 0.028170811012387276, "learning_rate": 1e-06, "loss": 0.0476, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3794.0, "completions/mean_length": 1380.935546875, "completions/mean_terminated_length": 834.2651977539062, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.2995843440294266, "epoch": 4.5394736842105265, "frac_reward_zero_std": 0.5625, "grad_norm": 356.0476379394531, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 567248200.0, "reward": 0.8191208243370056, "reward_std": 0.08875949680805206, "rewards/progression_diversity/mean": -0.006864731200039387, "rewards/progression_diversity/std": 0.0402359738945961, "rewards/symbolic_reward_accuracy/mean": 0.890625, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.9597981572151184, "rewards/symbolic_reward_partial_score/std": 0.14745070040225983, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0555503368377686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 372.0, "sampling/sampling_logp_difference/mean": 2.510345220565796, "step": 1725 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30750222504138947, "epoch": 4.542105263157895, "grad_norm": 0.007456324994564056, "learning_rate": 1e-06, "loss": 0.0425, "step": 1726 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.29517829418182373, "epoch": 4.544736842105263, "grad_norm": 0.017573416233062744, "learning_rate": 1e-06, "loss": 0.0846, "step": 1727 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3013046234846115, "epoch": 4.5473684210526315, "grad_norm": 0.003968199715018272, "learning_rate": 1e-06, "loss": 0.0405, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3952.0, "completions/mean_length": 1314.697265625, "completions/mean_terminated_length": 859.8892822265625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.2868567556142807, "epoch": 4.55, "frac_reward_zero_std": 0.5625, "grad_norm": 259.5559387207031, "learning_rate": 1e-06, "loss": 0.0893, "num_tokens": 568328301.0, "reward": 0.8367007374763489, "reward_std": 0.10205796360969543, "rewards/progression_diversity/mean": -0.006686838809400797, "rewards/progression_diversity/std": 0.0429992638528347, "rewards/symbolic_reward_accuracy/mean": 0.919921875, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.9591470956802368, "rewards/symbolic_reward_partial_score/std": 0.17069299519062042, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.058121681213379, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 372.0, "sampling/sampling_logp_difference/mean": 2.003477096557617, "step": 1729 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.29739847779273987, "epoch": 4.552631578947368, "grad_norm": 0.022392338141798973, "learning_rate": 1e-06, "loss": 0.0507, "step": 1730 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.30387774109840393, "epoch": 4.5552631578947365, "grad_norm": 0.00769463088363409, "learning_rate": 1e-06, "loss": 0.0124, "step": 1731 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2966042011976242, "epoch": 4.557894736842105, "grad_norm": 0.012574239633977413, "learning_rate": 1e-06, "loss": 0.0445, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3764.0, "completions/mean_length": 1427.546875, "completions/mean_terminated_length": 851.1318359375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.30412548780441284, "epoch": 4.560526315789474, "frac_reward_zero_std": 0.59375, "grad_norm": 225.99496459960938, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 569455493.0, "reward": 0.826937198638916, "reward_std": 0.10923168063163757, "rewards/progression_diversity/mean": -0.006476983428001404, "rewards/progression_diversity/std": 0.03768179193139076, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9480794072151184, "rewards/symbolic_reward_partial_score/std": 0.20172148942947388, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0539195537567139, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 372.0, "sampling/sampling_logp_difference/mean": 2.804877281188965, "step": 1733 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2931108772754669, "epoch": 4.563157894736842, "grad_norm": 0.008534945547580719, "learning_rate": 1e-06, "loss": 0.0567, "step": 1734 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.27719876170158386, "epoch": 4.565789473684211, "grad_norm": 0.015061067417263985, "learning_rate": 1e-06, "loss": 0.1137, "step": 1735 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2918764352798462, "epoch": 4.568421052631579, "grad_norm": 0.010875429026782513, "learning_rate": 1e-06, "loss": 0.0713, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 957.4921875, "completions/mean_terminated_length": 805.3569946289062, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.31101664900779724, "epoch": 4.571052631578947, "frac_reward_zero_std": 0.6875, "grad_norm": 0.010922472923994064, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 570348193.0, "reward": 0.875369131565094, "reward_std": 0.06468972563743591, "rewards/progression_diversity/mean": -0.0021541740279644728, "rewards/progression_diversity/std": 0.023635899648070335, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.9837239980697632, "rewards/symbolic_reward_partial_score/std": 0.10963507741689682, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0634751319885254, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 372.0, "sampling/sampling_logp_difference/mean": 1.4045908451080322, "step": 1737 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.310005784034729, "epoch": 4.573684210526316, "grad_norm": 0.012388059869408607, "learning_rate": 1e-06, "loss": -0.0014, "step": 1738 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.30088475346565247, "epoch": 4.576315789473684, "grad_norm": 0.002842206507921219, "learning_rate": 1e-06, "loss": 0.0399, "step": 1739 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3011449873447418, "epoch": 4.578947368421053, "grad_norm": 0.03648176044225693, "learning_rate": 1e-06, "loss": 0.087, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 1486.138671875, "completions/mean_terminated_length": 911.9817504882812, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.2893022149801254, "epoch": 4.581578947368421, "frac_reward_zero_std": 0.5, "grad_norm": 198.0384521484375, "learning_rate": 1e-06, "loss": 0.0774, "num_tokens": 571519400.0, "reward": 0.8131746053695679, "reward_std": 0.091860830783844, "rewards/progression_diversity/mean": -0.005787876434624195, "rewards/progression_diversity/std": 0.035509541630744934, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.95166015625, "rewards/symbolic_reward_partial_score/std": 0.1652219146490097, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0542500019073486, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 372.0, "sampling/sampling_logp_difference/mean": 2.839118003845215, "step": 1741 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.28446468710899353, "epoch": 4.58421052631579, "grad_norm": 0.019028333947062492, "learning_rate": 1e-06, "loss": 0.1049, "step": 1742 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2950809597969055, "epoch": 4.586842105263158, "grad_norm": 0.010133459232747555, "learning_rate": 1e-06, "loss": 0.0021, "step": 1743 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.28551898896694183, "epoch": 4.589473684210526, "grad_norm": 0.449716717004776, "learning_rate": 1e-06, "loss": 0.0884, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4694.0, "completions/mean_length": 1332.86328125, "completions/mean_terminated_length": 1002.399169921875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.29140228033065796, "epoch": 4.592105263157895, "frac_reward_zero_std": 0.59375, "grad_norm": 71.57343292236328, "learning_rate": 1e-06, "loss": 0.0489, "num_tokens": 572641506.0, "reward": 0.8343420624732971, "reward_std": 0.097034752368927, "rewards/progression_diversity/mean": -0.0032937643118202686, "rewards/progression_diversity/std": 0.02761550061404705, "rewards/symbolic_reward_accuracy/mean": 0.9140625, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.958984375, "rewards/symbolic_reward_partial_score/std": 0.16060270369052887, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0581992864608765, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 372.0, "sampling/sampling_logp_difference/mean": 1.9309183359146118, "step": 1745 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.297273188829422, "epoch": 4.594736842105263, "grad_norm": 0.005121263209730387, "learning_rate": 1e-06, "loss": 0.0292, "step": 1746 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.27994534373283386, "epoch": 4.597368421052631, "grad_norm": 0.008685395121574402, "learning_rate": 1e-06, "loss": 0.0841, "step": 1747 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.29429104924201965, "epoch": 4.6, "grad_norm": 0.004201680421829224, "learning_rate": 1e-06, "loss": 0.018, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3964.0, "completions/mean_length": 1609.158203125, "completions/mean_terminated_length": 1008.5548706054688, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.27888044714927673, "epoch": 4.602631578947369, "frac_reward_zero_std": 0.65625, "grad_norm": 37.15717697143555, "learning_rate": 1e-06, "loss": 0.0756, "num_tokens": 573900403.0, "reward": 0.8299179673194885, "reward_std": 0.08020342141389847, "rewards/progression_diversity/mean": -0.006251877639442682, "rewards/progression_diversity/std": 0.038109783083200455, "rewards/symbolic_reward_accuracy/mean": 0.9140625, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.9501953125, "rewards/symbolic_reward_partial_score/std": 0.183476984500885, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0542418956756592, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 374.0, "sampling/sampling_logp_difference/mean": 2.3596372604370117, "step": 1749 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.27801230549812317, "epoch": 4.605263157894737, "grad_norm": 213.15374755859375, "learning_rate": 1e-06, "loss": 0.1129, "step": 1750 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2716394513845444, "epoch": 4.6078947368421055, "grad_norm": 0.006218337453901768, "learning_rate": 1e-06, "loss": 0.1002, "step": 1751 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2844415009021759, "epoch": 4.610526315789474, "grad_norm": 0.013139521703124046, "learning_rate": 1e-06, "loss": 0.0223, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 1402.400390625, "completions/mean_terminated_length": 919.1229858398438, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.2805843651294708, "epoch": 4.613157894736842, "frac_reward_zero_std": 0.625, "grad_norm": 392.7408142089844, "learning_rate": 1e-06, "loss": 0.0717, "num_tokens": 575023424.0, "reward": 0.8217633366584778, "reward_std": 0.07674242556095123, "rewards/progression_diversity/mean": -0.00628986582159996, "rewards/progression_diversity/std": 0.040007736533880234, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9444986581802368, "rewards/symbolic_reward_partial_score/std": 0.19026826322078705, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0610547065734863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 376.0, "sampling/sampling_logp_difference/mean": 1.5220887660980225, "step": 1753 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2966681718826294, "epoch": 4.61578947368421, "grad_norm": 0.011541535146534443, "learning_rate": 1e-06, "loss": 0.0328, "step": 1754 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2979959547519684, "epoch": 4.618421052631579, "grad_norm": 0.007087912876158953, "learning_rate": 1e-06, "loss": 0.0277, "step": 1755 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.29420797526836395, "epoch": 4.621052631578947, "grad_norm": 0.012560022063553333, "learning_rate": 1e-06, "loss": 0.0552, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4188.0, "completions/mean_length": 1243.05859375, "completions/mean_terminated_length": 910.6227416992188, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.3041873723268509, "epoch": 4.623684210526315, "frac_reward_zero_std": 0.59375, "grad_norm": 191.67491149902344, "learning_rate": 1e-06, "loss": 0.0741, "num_tokens": 576051486.0, "reward": 0.8431257009506226, "reward_std": 0.0861952155828476, "rewards/progression_diversity/mean": -0.0038361886981874704, "rewards/progression_diversity/std": 0.03068256378173828, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9654947519302368, "rewards/symbolic_reward_partial_score/std": 0.15342977643013, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0610965490341187, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 376.0, "sampling/sampling_logp_difference/mean": 1.342087984085083, "step": 1757 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2971126586198807, "epoch": 4.626315789473685, "grad_norm": 0.01794944889843464, "learning_rate": 1e-06, "loss": 0.0504, "step": 1758 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.305801659822464, "epoch": 4.628947368421053, "grad_norm": 0.012613825500011444, "learning_rate": 1e-06, "loss": 0.0083, "step": 1759 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30587300658226013, "epoch": 4.631578947368421, "grad_norm": 0.022545624524354935, "learning_rate": 1e-06, "loss": 0.0683, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3825.0, "completions/mean_length": 1357.8828125, "completions/mean_terminated_length": 935.4617919921875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.30523401498794556, "epoch": 4.63421052631579, "frac_reward_zero_std": 0.71875, "grad_norm": 0.04670589789748192, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 577145698.0, "reward": 0.8240532279014587, "reward_std": 0.07884141802787781, "rewards/progression_diversity/mean": -0.006792136933654547, "rewards/progression_diversity/std": 0.047225385904312134, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.9475911259651184, "rewards/symbolic_reward_partial_score/std": 0.18192888796329498, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0601003170013428, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 376.0, "sampling/sampling_logp_difference/mean": 1.485578179359436, "step": 1761 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.29327723383903503, "epoch": 4.636842105263158, "grad_norm": 0.01936684362590313, "learning_rate": 1e-06, "loss": 0.0702, "step": 1762 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.29366083443164825, "epoch": 4.639473684210526, "grad_norm": 0.010277163237333298, "learning_rate": 1e-06, "loss": 0.0261, "step": 1763 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.29086172580718994, "epoch": 4.6421052631578945, "grad_norm": 0.008378474041819572, "learning_rate": 1e-06, "loss": 0.0463, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3414.0, "completions/mean_length": 1404.08203125, "completions/mean_terminated_length": 889.6202392578125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.30439208447933197, "epoch": 4.644736842105263, "frac_reward_zero_std": 0.59375, "grad_norm": 203.2693328857422, "learning_rate": 1e-06, "loss": 0.0463, "num_tokens": 578260716.0, "reward": 0.8370853662490845, "reward_std": 0.09505656361579895, "rewards/progression_diversity/mean": -0.007285004947334528, "rewards/progression_diversity/std": 0.046391766518354416, "rewards/symbolic_reward_accuracy/mean": 0.91796875, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.9656575322151184, "rewards/symbolic_reward_partial_score/std": 0.1476442664861679, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0571269989013672, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 378.0, "sampling/sampling_logp_difference/mean": 2.06299090385437, "step": 1765 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29458560049533844, "epoch": 4.647368421052631, "grad_norm": 0.011282769963145256, "learning_rate": 1e-06, "loss": 0.0713, "step": 1766 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2905200123786926, "epoch": 4.65, "grad_norm": 5.091102600097656, "learning_rate": 1e-06, "loss": 0.0594, "step": 1767 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2920839935541153, "epoch": 4.652631578947369, "grad_norm": 0.00980228092521429, "learning_rate": 1e-06, "loss": 0.0531, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12152.0, "completions/mean_length": 1461.2734375, "completions/mean_terminated_length": 979.8951416015625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.3031252771615982, "epoch": 4.655263157894737, "frac_reward_zero_std": 0.59375, "grad_norm": 0.020676128566265106, "learning_rate": 1e-06, "loss": 0.0366, "num_tokens": 579421880.0, "reward": 0.8634036183357239, "reward_std": 0.1007419154047966, "rewards/progression_diversity/mean": -0.007299537770450115, "rewards/progression_diversity/std": 0.04725031182169914, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9700520634651184, "rewards/symbolic_reward_partial_score/std": 0.16211473941802979, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0525739192962646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 380.0, "sampling/sampling_logp_difference/mean": 2.551374912261963, "step": 1769 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.28818726539611816, "epoch": 4.657894736842105, "grad_norm": 0.013333577662706375, "learning_rate": 1e-06, "loss": 0.0613, "step": 1770 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2797438055276871, "epoch": 4.660526315789474, "grad_norm": 12.496295928955078, "learning_rate": 1e-06, "loss": 0.089, "step": 1771 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29146166145801544, "epoch": 4.663157894736842, "grad_norm": 7.178304672241211, "learning_rate": 1e-06, "loss": 0.0752, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3135.0, "completions/mean_length": 1521.27734375, "completions/mean_terminated_length": 1072.7042236328125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.28964005410671234, "epoch": 4.66578947368421, "frac_reward_zero_std": 0.4375, "grad_norm": 328.8640441894531, "learning_rate": 1e-06, "loss": 0.1406, "num_tokens": 580629446.0, "reward": 0.8446449041366577, "reward_std": 0.14509400725364685, "rewards/progression_diversity/mean": -0.008165711537003517, "rewards/progression_diversity/std": 0.05180385336279869, "rewards/symbolic_reward_accuracy/mean": 0.931640625, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.9622395634651184, "rewards/symbolic_reward_partial_score/std": 0.16547498106956482, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0513108968734741, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 380.0, "sampling/sampling_logp_difference/mean": 2.6449103355407715, "step": 1773 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2889109253883362, "epoch": 4.668421052631579, "grad_norm": 0.018843237310647964, "learning_rate": 1e-06, "loss": 0.048, "step": 1774 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.28207381069660187, "epoch": 4.671052631578947, "grad_norm": 0.029016956686973572, "learning_rate": 1e-06, "loss": 0.0446, "step": 1775 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.28926801681518555, "epoch": 4.673684210526316, "grad_norm": 0.013139498420059681, "learning_rate": 1e-06, "loss": 0.0746, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3197.0, "completions/mean_length": 1161.837890625, "completions/mean_terminated_length": 858.6076049804688, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.2987944483757019, "epoch": 4.676315789473684, "frac_reward_zero_std": 0.65625, "grad_norm": 314.06134033203125, "learning_rate": 1e-06, "loss": 0.0431, "num_tokens": 581621939.0, "reward": 0.8670340776443481, "reward_std": 0.08653353154659271, "rewards/progression_diversity/mean": -0.005582699552178383, "rewards/progression_diversity/std": 0.042026419192552567, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9788411259651184, "rewards/symbolic_reward_partial_score/std": 0.12187561392784119, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.05836021900177, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 380.0, "sampling/sampling_logp_difference/mean": 2.0134835243225098, "step": 1777 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.312513142824173, "epoch": 4.678947368421053, "grad_norm": 0.004544838331639767, "learning_rate": 1e-06, "loss": -0.0029, "step": 1778 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.28765228390693665, "epoch": 4.681578947368421, "grad_norm": 0.00618336908519268, "learning_rate": 1e-06, "loss": 0.0752, "step": 1779 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29780954122543335, "epoch": 4.684210526315789, "grad_norm": 0.003846309846267104, "learning_rate": 1e-06, "loss": 0.0645, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3127.0, "completions/mean_length": 1343.55078125, "completions/mean_terminated_length": 858.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.29853370785713196, "epoch": 4.686842105263158, "frac_reward_zero_std": 0.65625, "grad_norm": 134.65643310546875, "learning_rate": 1e-06, "loss": 0.0479, "num_tokens": 582688941.0, "reward": 0.8453360795974731, "reward_std": 0.08855551481246948, "rewards/progression_diversity/mean": -0.007415060419589281, "rewards/progression_diversity/std": 0.04754326492547989, "rewards/symbolic_reward_accuracy/mean": 0.935546875, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.9573568105697632, "rewards/symbolic_reward_partial_score/std": 0.1826906055212021, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0548827648162842, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 380.0, "sampling/sampling_logp_difference/mean": 2.597440242767334, "step": 1781 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.30938100814819336, "epoch": 4.689473684210526, "grad_norm": 0.007459544111043215, "learning_rate": 1e-06, "loss": 0.0288, "step": 1782 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2973015457391739, "epoch": 4.692105263157895, "grad_norm": 0.013295495882630348, "learning_rate": 1e-06, "loss": 0.0913, "step": 1783 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.29738104343414307, "epoch": 4.6947368421052635, "grad_norm": 0.008674710988998413, "learning_rate": 1e-06, "loss": 0.1016, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8991.0, "completions/mean_length": 1652.69921875, "completions/mean_terminated_length": 896.4722900390625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.2916813790798187, "epoch": 4.697368421052632, "frac_reward_zero_std": 0.53125, "grad_norm": 311.70086669921875, "learning_rate": 1e-06, "loss": 0.1287, "num_tokens": 583923411.0, "reward": 0.816964864730835, "reward_std": 0.0918387770652771, "rewards/progression_diversity/mean": -0.00762150390073657, "rewards/progression_diversity/std": 0.04115252196788788, "rewards/symbolic_reward_accuracy/mean": 0.896484375, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.9454752802848816, "rewards/symbolic_reward_partial_score/std": 0.191688671708107, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0515938997268677, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 380.0, "sampling/sampling_logp_difference/mean": 2.775969982147217, "step": 1785 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2890394926071167, "epoch": 4.7, "grad_norm": 0.01550104096531868, "learning_rate": 1e-06, "loss": 0.1227, "step": 1786 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2989995628595352, "epoch": 4.7026315789473685, "grad_norm": 0.008528691716492176, "learning_rate": 1e-06, "loss": 0.0246, "step": 1787 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.29257068037986755, "epoch": 4.705263157894737, "grad_norm": 0.010270673781633377, "learning_rate": 1e-06, "loss": 0.0584, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 1359.19140625, "completions/mean_terminated_length": 905.726318359375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.3098142296075821, "epoch": 4.707894736842105, "frac_reward_zero_std": 0.625, "grad_norm": 0.009268060326576233, "learning_rate": 1e-06, "loss": -0.0146, "num_tokens": 585016949.0, "reward": 0.8493025302886963, "reward_std": 0.07921440899372101, "rewards/progression_diversity/mean": -0.006267758086323738, "rewards/progression_diversity/std": 0.039985544979572296, "rewards/symbolic_reward_accuracy/mean": 0.9375, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.9646809697151184, "rewards/symbolic_reward_partial_score/std": 0.15783125162124634, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.058639645576477, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 380.0, "sampling/sampling_logp_difference/mean": 1.931031346321106, "step": 1789 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3012120723724365, "epoch": 4.7105263157894735, "grad_norm": 0.018348556011915207, "learning_rate": 1e-06, "loss": 0.0325, "step": 1790 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3086209297180176, "epoch": 4.713157894736842, "grad_norm": 0.03524219989776611, "learning_rate": 1e-06, "loss": 0.0427, "step": 1791 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.29122862219810486, "epoch": 4.715789473684211, "grad_norm": 0.004257425665855408, "learning_rate": 1e-06, "loss": 0.1065, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3446.0, "completions/mean_length": 1595.56640625, "completions/mean_terminated_length": 899.9959106445312, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.287975549697876, "epoch": 4.718421052631579, "frac_reward_zero_std": 0.5, "grad_norm": 392.2756042480469, "learning_rate": 1e-06, "loss": 0.1354, "num_tokens": 586220343.0, "reward": 0.8267496824264526, "reward_std": 0.13566027581691742, "rewards/progression_diversity/mean": -0.010585745796561241, "rewards/progression_diversity/std": 0.05752396956086159, "rewards/symbolic_reward_accuracy/mean": 0.912109375, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.9462890625, "rewards/symbolic_reward_partial_score/std": 0.20108237862586975, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.048526644706726, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 380.0, "sampling/sampling_logp_difference/mean": 3.1703460216522217, "step": 1793 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.29304662346839905, "epoch": 4.721052631578948, "grad_norm": 282.28704833984375, "learning_rate": 1e-06, "loss": 0.1636, "step": 1794 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.29584261775016785, "epoch": 4.723684210526316, "grad_norm": 0.01929856278002262, "learning_rate": 1e-06, "loss": 0.1043, "step": 1795 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.30293792486190796, "epoch": 4.726315789473684, "grad_norm": 0.007616270799189806, "learning_rate": 1e-06, "loss": 0.0534, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3592.0, "completions/mean_length": 1216.298828125, "completions/mean_terminated_length": 914.1534423828125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.2999269813299179, "epoch": 4.728947368421053, "frac_reward_zero_std": 0.65625, "grad_norm": 63.879966735839844, "learning_rate": 1e-06, "loss": 0.0632, "num_tokens": 587246896.0, "reward": 0.8688350915908813, "reward_std": 0.0851588174700737, "rewards/progression_diversity/mean": -0.0061400942504405975, "rewards/progression_diversity/std": 0.04974029213190079, "rewards/symbolic_reward_accuracy/mean": 0.962890625, "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, "rewards/symbolic_reward_partial_score/mean": 0.97705078125, "rewards/symbolic_reward_partial_score/std": 0.13179610669612885, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0599864721298218, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 382.0, "sampling/sampling_logp_difference/mean": 1.3513113260269165, "step": 1797 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.30788634717464447, "epoch": 4.731578947368421, "grad_norm": 0.03443191573023796, "learning_rate": 1e-06, "loss": 0.0174, "step": 1798 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2986367344856262, "epoch": 4.734210526315789, "grad_norm": 0.0047141476534307, "learning_rate": 1e-06, "loss": 0.6458, "step": 1799 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.29760782420635223, "epoch": 4.7368421052631575, "grad_norm": 0.0066524529829621315, "learning_rate": 1e-06, "loss": 0.0348, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3175.0, "completions/mean_length": 2149.83984375, "completions/mean_terminated_length": 1008.70458984375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.28144165873527527, "epoch": 4.739473684210527, "frac_reward_zero_std": 0.5625, "grad_norm": 226.54739379882812, "learning_rate": 1e-06, "loss": 0.083, "num_tokens": 588753086.0, "reward": 0.779984712600708, "reward_std": 0.13797515630722046, "rewards/progression_diversity/mean": -0.019111623987555504, "rewards/progression_diversity/std": 0.0777515172958374, "rewards/symbolic_reward_accuracy/mean": 0.849609375, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.9261067509651184, "rewards/symbolic_reward_partial_score/std": 0.22488640248775482, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0423637628555298, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 384.0, "sampling/sampling_logp_difference/mean": 3.5376710891723633, "step": 1801 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2867640554904938, "epoch": 4.742105263157895, "grad_norm": 0.025515422224998474, "learning_rate": 1e-06, "loss": 0.1148, "step": 1802 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.27136462926864624, "epoch": 4.744736842105263, "grad_norm": 0.008495149202644825, "learning_rate": 1e-06, "loss": 0.1537, "step": 1803 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2758960574865341, "epoch": 4.747368421052632, "grad_norm": 0.33327996730804443, "learning_rate": 1e-06, "loss": 0.0768, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 9181.0, "completions/mean_length": 1735.49609375, "completions/mean_terminated_length": 1077.80810546875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.2854105681180954, "epoch": 4.75, "frac_reward_zero_std": 0.375, "grad_norm": 10.029664039611816, "learning_rate": 1e-06, "loss": 0.0718, "num_tokens": 590060988.0, "reward": 0.8515157699584961, "reward_std": 0.12311355024576187, "rewards/progression_diversity/mean": -0.014438299462199211, "rewards/progression_diversity/std": 0.07580766081809998, "rewards/symbolic_reward_accuracy/mean": 0.943359375, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.9671223759651184, "rewards/symbolic_reward_partial_score/std": 0.15571942925453186, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.04230797290802, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 386.0, "sampling/sampling_logp_difference/mean": 3.432555675506592, "step": 1805 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2759237736463547, "epoch": 4.752631578947368, "grad_norm": 1959.9281005859375, "learning_rate": 1e-06, "loss": 0.2892, "step": 1806 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.28651775419712067, "epoch": 4.755263157894737, "grad_norm": 352.3994140625, "learning_rate": 1e-06, "loss": 0.1454, "step": 1807 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2970658540725708, "epoch": 4.757894736842105, "grad_norm": 0.0121544124558568, "learning_rate": 1e-06, "loss": 0.0625, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 1934.1640625, "completions/mean_terminated_length": 938.6638793945312, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.27560150623321533, "epoch": 4.760526315789473, "frac_reward_zero_std": 0.40625, "grad_norm": 242.6947479248047, "learning_rate": 1e-06, "loss": 0.1073, "num_tokens": 591441072.0, "reward": 0.8055234551429749, "reward_std": 0.10336482524871826, "rewards/progression_diversity/mean": -0.01895313523709774, "rewards/progression_diversity/std": 0.08070053160190582, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.93896484375, "rewards/symbolic_reward_partial_score/std": 0.19326075911521912, "rewards/tag_count_reward/mean": -0.056640625, "rewards/tag_count_reward/std": 0.23138070106506348, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0425719022750854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 388.0, "sampling/sampling_logp_difference/mean": 3.5746545791625977, "step": 1809 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.28119949996471405, "epoch": 4.7631578947368425, "grad_norm": 0.02790587767958641, "learning_rate": 1e-06, "loss": 0.1284, "step": 1810 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2921789288520813, "epoch": 4.765789473684211, "grad_norm": 0.060401126742362976, "learning_rate": 1e-06, "loss": 0.0706, "step": 1811 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.28399544954299927, "epoch": 4.768421052631579, "grad_norm": 0.10208897292613983, "learning_rate": 1e-06, "loss": 0.0961, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3719.0, "completions/mean_length": 1791.84375, "completions/mean_terminated_length": 819.0333862304688, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.2930610924959183, "epoch": 4.771052631578947, "frac_reward_zero_std": 0.5625, "grad_norm": 260.9751892089844, "learning_rate": 1e-06, "loss": 0.0712, "num_tokens": 592748288.0, "reward": 0.7997059226036072, "reward_std": 0.09009853005409241, "rewards/progression_diversity/mean": -0.014760013669729233, "rewards/progression_diversity/std": 0.06976296752691269, "rewards/symbolic_reward_accuracy/mean": 0.876953125, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.9298502206802368, "rewards/symbolic_reward_partial_score/std": 0.21666119992733002, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0533522367477417, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 388.0, "sampling/sampling_logp_difference/mean": 2.341224431991577, "step": 1813 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.29311226308345795, "epoch": 4.773684210526316, "grad_norm": 0.006627992261201143, "learning_rate": 1e-06, "loss": 0.0818, "step": 1814 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2873583137989044, "epoch": 4.776315789473684, "grad_norm": 0.0039825947023928165, "learning_rate": 1e-06, "loss": 0.0784, "step": 1815 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.29943425953388214, "epoch": 4.778947368421052, "grad_norm": 0.006021894048899412, "learning_rate": 1e-06, "loss": 0.0369, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3120.0, "completions/mean_length": 1341.46875, "completions/mean_terminated_length": 949.5791625976562, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.3028171956539154, "epoch": 4.781578947368421, "frac_reward_zero_std": 0.34375, "grad_norm": 35.22563552856445, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 593841616.0, "reward": 0.8557599782943726, "reward_std": 0.1146036684513092, "rewards/progression_diversity/mean": -0.009937912225723267, "rewards/progression_diversity/std": 0.0624278225004673, "rewards/symbolic_reward_accuracy/mean": 0.943359375, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.974609375, "rewards/symbolic_reward_partial_score/std": 0.12061332911252975, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.052259922027588, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 390.0, "sampling/sampling_logp_difference/mean": 2.46126127243042, "step": 1817 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2946004569530487, "epoch": 4.784210526315789, "grad_norm": 524.8899536132812, "learning_rate": 1e-06, "loss": 0.1229, "step": 1818 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.29525619745254517, "epoch": 4.786842105263158, "grad_norm": 0.0062756622210145, "learning_rate": 1e-06, "loss": 0.0398, "step": 1819 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2876027226448059, "epoch": 4.7894736842105265, "grad_norm": 0.011834767647087574, "learning_rate": 1e-06, "loss": 0.072, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8261.0, "completions/mean_length": 1432.533203125, "completions/mean_terminated_length": 950.227783203125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.294967457652092, "epoch": 4.792105263157895, "frac_reward_zero_std": 0.59375, "grad_norm": 206.98422241210938, "learning_rate": 1e-06, "loss": 0.0693, "num_tokens": 594979361.0, "reward": 0.8551140427589417, "reward_std": 0.08930052816867828, "rewards/progression_diversity/mean": -0.011060354299843311, "rewards/progression_diversity/std": 0.0651150494813919, "rewards/symbolic_reward_accuracy/mean": 0.9453125, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.97119140625, "rewards/symbolic_reward_partial_score/std": 0.14763346314430237, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0505045652389526, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 392.0, "sampling/sampling_logp_difference/mean": 2.7975821495056152, "step": 1821 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29428502917289734, "epoch": 4.794736842105263, "grad_norm": 0.00786570180207491, "learning_rate": 1e-06, "loss": 0.0222, "step": 1822 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2888794392347336, "epoch": 4.7973684210526315, "grad_norm": 0.015527124516665936, "learning_rate": 1e-06, "loss": 0.0965, "step": 1823 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2929443120956421, "epoch": 4.8, "grad_norm": 0.0051057226955890656, "learning_rate": 1e-06, "loss": 0.0823, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5839.0, "completions/mean_length": 1806.5625, "completions/mean_terminated_length": 963.2396240234375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.2864207327365875, "epoch": 4.802631578947368, "frac_reward_zero_std": 0.59375, "grad_norm": 191.23019409179688, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 596299105.0, "reward": 0.7930800914764404, "reward_std": 0.09042888879776001, "rewards/progression_diversity/mean": -0.01328323408961296, "rewards/progression_diversity/std": 0.06475852429866791, "rewards/symbolic_reward_accuracy/mean": 0.865234375, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.9265950322151184, "rewards/symbolic_reward_partial_score/std": 0.22019346058368683, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0515691041946411, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 392.0, "sampling/sampling_logp_difference/mean": 2.668024778366089, "step": 1825 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.26454444229602814, "epoch": 4.8052631578947365, "grad_norm": 0.004221981856971979, "learning_rate": 1e-06, "loss": 0.1982, "step": 1826 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2992611974477768, "epoch": 4.807894736842105, "grad_norm": 0.009702647104859352, "learning_rate": 1e-06, "loss": 0.0082, "step": 1827 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.28555387258529663, "epoch": 4.810526315789474, "grad_norm": 0.007640550844371319, "learning_rate": 1e-06, "loss": 0.0761, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 1518.830078125, "completions/mean_terminated_length": 914.5548706054688, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.2895033657550812, "epoch": 4.813157894736842, "frac_reward_zero_std": 0.4375, "grad_norm": 482.5952453613281, "learning_rate": 1e-06, "loss": 0.0888, "num_tokens": 597485546.0, "reward": 0.8472241163253784, "reward_std": 0.12179014086723328, "rewards/progression_diversity/mean": -0.013924511149525642, "rewards/progression_diversity/std": 0.07416489720344543, "rewards/symbolic_reward_accuracy/mean": 0.93359375, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.9703775644302368, "rewards/symbolic_reward_partial_score/std": 0.1422656625509262, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0476664304733276, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 392.0, "sampling/sampling_logp_difference/mean": 3.4861273765563965, "step": 1829 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.30734431743621826, "epoch": 4.815789473684211, "grad_norm": 0.01676630787551403, "learning_rate": 1e-06, "loss": 0.0315, "step": 1830 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2916211038827896, "epoch": 4.818421052631579, "grad_norm": 0.01112330798059702, "learning_rate": 1e-06, "loss": 0.0759, "step": 1831 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2971077710390091, "epoch": 4.821052631578947, "grad_norm": 0.006531843915581703, "learning_rate": 1e-06, "loss": 0.0779, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 1641.30859375, "completions/mean_terminated_length": 1010.7658081054688, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.2911697328090668, "epoch": 4.823684210526316, "frac_reward_zero_std": 0.53125, "grad_norm": 0.01622670330107212, "learning_rate": 1e-06, "loss": 0.0771, "num_tokens": 598745384.0, "reward": 0.8167889714241028, "reward_std": 0.10516804456710815, "rewards/progression_diversity/mean": -0.010561157017946243, "rewards/progression_diversity/std": 0.05700741335749626, "rewards/symbolic_reward_accuracy/mean": 0.8984375, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.9378255605697632, "rewards/symbolic_reward_partial_score/std": 0.2107086479663849, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045763373374939, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 394.0, "sampling/sampling_logp_difference/mean": 3.4746012687683105, "step": 1833 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.28375144302845, "epoch": 4.826315789473684, "grad_norm": 0.007259514648467302, "learning_rate": 1e-06, "loss": 0.0821, "step": 1834 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.28915947675704956, "epoch": 4.828947368421053, "grad_norm": 0.020610112696886063, "learning_rate": 1e-06, "loss": 0.0623, "step": 1835 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2865881621837616, "epoch": 4.831578947368421, "grad_norm": 0.014592897146940231, "learning_rate": 1e-06, "loss": 0.1088, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 1590.58984375, "completions/mean_terminated_length": 957.8778686523438, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.29244448244571686, "epoch": 4.83421052631579, "frac_reward_zero_std": 0.5, "grad_norm": 432.8992919921875, "learning_rate": 1e-06, "loss": 0.042, "num_tokens": 599961590.0, "reward": 0.8015085458755493, "reward_std": 0.11123409122228622, "rewards/progression_diversity/mean": -0.010282262228429317, "rewards/progression_diversity/std": 0.05530929192900658, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.9435220956802368, "rewards/symbolic_reward_partial_score/std": 0.17923220992088318, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0479365587234497, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 396.0, "sampling/sampling_logp_difference/mean": 3.16892409324646, "step": 1837 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.28493815660476685, "epoch": 4.836842105263158, "grad_norm": 0.014859353192150593, "learning_rate": 1e-06, "loss": 0.0579, "step": 1838 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.29050807654857635, "epoch": 4.839473684210526, "grad_norm": 0.00957922451198101, "learning_rate": 1e-06, "loss": 0.0393, "step": 1839 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.28927716612815857, "epoch": 4.842105263157895, "grad_norm": 0.018363086506724358, "learning_rate": 1e-06, "loss": 0.0646, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4263.0, "completions/mean_length": 961.29296875, "completions/mean_terminated_length": 839.8543090820312, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.30787861347198486, "epoch": 4.844736842105263, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09452533721923828, "learning_rate": 1e-06, "loss": 0.0309, "num_tokens": 600826380.0, "reward": 0.8753113746643066, "reward_std": 0.08123890310525894, "rewards/progression_diversity/mean": -0.0030442136339843273, "rewards/progression_diversity/std": 0.038930658251047134, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.98291015625, "rewards/symbolic_reward_partial_score/std": 0.1046241894364357, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.063698410987854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 394.0, "sampling/sampling_logp_difference/mean": 1.2699187994003296, "step": 1841 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.31785525381565094, "epoch": 4.847368421052631, "grad_norm": 36.97111511230469, "learning_rate": 1e-06, "loss": 0.0007, "step": 1842 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3095642328262329, "epoch": 4.85, "grad_norm": 0.019375812262296677, "learning_rate": 1e-06, "loss": 0.0321, "step": 1843 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3243100643157959, "epoch": 4.852631578947369, "grad_norm": 0.00351691129617393, "learning_rate": 1e-06, "loss": -0.0018, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4850.0, "completions/mean_length": 1875.322265625, "completions/mean_terminated_length": 940.2515869140625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.2960711419582367, "epoch": 4.855263157894737, "frac_reward_zero_std": 0.5, "grad_norm": 743.4161376953125, "learning_rate": 1e-06, "loss": 0.0685, "num_tokens": 602193201.0, "reward": 0.8039959669113159, "reward_std": 0.11662513762712479, "rewards/progression_diversity/mean": -0.015446132980287075, "rewards/progression_diversity/std": 0.07263787090778351, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.93310546875, "rewards/symbolic_reward_partial_score/std": 0.21881107985973358, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0418115854263306, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 396.0, "sampling/sampling_logp_difference/mean": 4.129971981048584, "step": 1845 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.28108513355255127, "epoch": 4.8578947368421055, "grad_norm": 685.2288818359375, "learning_rate": 1e-06, "loss": 0.2488, "step": 1846 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.28235989809036255, "epoch": 4.860526315789474, "grad_norm": 0.029184581711888313, "learning_rate": 1e-06, "loss": 0.1114, "step": 1847 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.28090812265872955, "epoch": 4.863157894736842, "grad_norm": 0.2511216998100281, "learning_rate": 1e-06, "loss": 0.0608, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 2357.298828125, "completions/mean_terminated_length": 872.8314819335938, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.26737289130687714, "epoch": 4.86578947368421, "frac_reward_zero_std": 0.3125, "grad_norm": 318.45111083984375, "learning_rate": 1e-06, "loss": 0.1068, "num_tokens": 603795082.0, "reward": 0.7866251468658447, "reward_std": 0.13819053769111633, "rewards/progression_diversity/mean": -0.028891239315271378, "rewards/progression_diversity/std": 0.10002173483371735, "rewards/symbolic_reward_accuracy/mean": 0.869140625, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.91015625, "rewards/symbolic_reward_partial_score/std": 0.25601255893707275, "rewards/tag_count_reward/mean": -0.076171875, "rewards/tag_count_reward/std": 0.26553234457969666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0239213705062866, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 396.0, "sampling/sampling_logp_difference/mean": 6.760751247406006, "step": 1849 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2813594937324524, "epoch": 4.868421052631579, "grad_norm": 371.5801086425781, "learning_rate": 1e-06, "loss": 0.1545, "step": 1850 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.25378528982400894, "epoch": 4.871052631578947, "grad_norm": 1.993329405784607, "learning_rate": 1e-06, "loss": 0.2296, "step": 1851 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2722862958908081, "epoch": 4.873684210526315, "grad_norm": 0.014913872815668583, "learning_rate": 1e-06, "loss": 0.1323, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 2810.728515625, "completions/mean_terminated_length": 974.8757934570312, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.2696688771247864, "epoch": 4.876315789473685, "frac_reward_zero_std": 0.34375, "grad_norm": 104.42438507080078, "learning_rate": 1e-06, "loss": 0.0423, "num_tokens": 605653183.0, "reward": 0.7709691524505615, "reward_std": 0.15861886739730835, "rewards/progression_diversity/mean": -0.03199107199907303, "rewards/progression_diversity/std": 0.09810096770524979, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.8893228769302368, "rewards/symbolic_reward_partial_score/std": 0.2945672869682312, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0273919105529785, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 400.0, "sampling/sampling_logp_difference/mean": 5.841981887817383, "step": 1853 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.25676195323467255, "epoch": 4.878947368421053, "grad_norm": 27.663148880004883, "learning_rate": 1e-06, "loss": 0.1787, "step": 1854 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.23987554013729095, "epoch": 4.881578947368421, "grad_norm": 0.01363349985331297, "learning_rate": 1e-06, "loss": 0.2546, "step": 1855 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.26036641001701355, "epoch": 4.88421052631579, "grad_norm": 0.026609908789396286, "learning_rate": 1e-06, "loss": 0.0951, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3775.0, "completions/mean_length": 2387.767578125, "completions/mean_terminated_length": 973.0946655273438, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.2557432949542999, "epoch": 4.886842105263158, "frac_reward_zero_std": 0.34375, "grad_norm": 442.3005065917969, "learning_rate": 1e-06, "loss": 0.1959, "num_tokens": 607298216.0, "reward": 0.7776139974594116, "reward_std": 0.15359753370285034, "rewards/progression_diversity/mean": -0.03156745806336403, "rewards/progression_diversity/std": 0.10606934875249863, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.8958333730697632, "rewards/symbolic_reward_partial_score/std": 0.2811386287212372, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0301284790039062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 400.0, "sampling/sampling_logp_difference/mean": 5.681142807006836, "step": 1857 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2727195918560028, "epoch": 4.889473684210526, "grad_norm": 266.2679443359375, "learning_rate": 1e-06, "loss": 0.1627, "step": 1858 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2670164704322815, "epoch": 4.8921052631578945, "grad_norm": 0.010746384039521217, "learning_rate": 1e-06, "loss": 0.1468, "step": 1859 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.27139565348625183, "epoch": 4.894736842105263, "grad_norm": 0.0044360412284731865, "learning_rate": 1e-06, "loss": 0.1136, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3370.0, "completions/mean_length": 1420.21484375, "completions/mean_terminated_length": 811.9308471679688, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.2882204055786133, "epoch": 4.897368421052631, "frac_reward_zero_std": 0.5, "grad_norm": 773.8041381835938, "learning_rate": 1e-06, "loss": 0.1389, "num_tokens": 608393302.0, "reward": 0.8496336936950684, "reward_std": 0.10768449306488037, "rewards/progression_diversity/mean": -0.017098724842071533, "rewards/progression_diversity/std": 0.08937934786081314, "rewards/symbolic_reward_accuracy/mean": 0.939453125, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.9635416269302368, "rewards/symbolic_reward_partial_score/std": 0.16535688936710358, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0443835258483887, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 402.0, "sampling/sampling_logp_difference/mean": 3.879685878753662, "step": 1861 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.30971573293209076, "epoch": 4.9, "grad_norm": 0.017872009426355362, "learning_rate": 1e-06, "loss": 0.0584, "step": 1862 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30385591089725494, "epoch": 4.902631578947369, "grad_norm": 0.0137720238417387, "learning_rate": 1e-06, "loss": 0.0668, "step": 1863 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.28459571301937103, "epoch": 4.905263157894737, "grad_norm": 0.005599349737167358, "learning_rate": 1e-06, "loss": 0.138, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3664.0, "completions/mean_length": 2037.166015625, "completions/mean_terminated_length": 854.2346801757812, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.29497842490673065, "epoch": 4.907894736842105, "frac_reward_zero_std": 0.46875, "grad_norm": 3.637153148651123, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 609837131.0, "reward": 0.8191713094711304, "reward_std": 0.12532027065753937, "rewards/progression_diversity/mean": -0.03111710585653782, "rewards/progression_diversity/std": 0.11476296186447144, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9334309697151184, "rewards/symbolic_reward_partial_score/std": 0.229754239320755, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0394989252090454, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 404.0, "sampling/sampling_logp_difference/mean": 4.205488204956055, "step": 1865 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.28409726917743683, "epoch": 4.910526315789474, "grad_norm": 175.42616271972656, "learning_rate": 1e-06, "loss": 0.1474, "step": 1866 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2551625221967697, "epoch": 4.913157894736842, "grad_norm": 11.765541076660156, "learning_rate": 1e-06, "loss": 0.2086, "step": 1867 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2779727131128311, "epoch": 4.91578947368421, "grad_norm": 0.00957444030791521, "learning_rate": 1e-06, "loss": 0.1011, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2560.607421875, "completions/mean_terminated_length": 1031.33837890625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.26823192834854126, "epoch": 4.918421052631579, "frac_reward_zero_std": 0.28125, "grad_norm": 1057.808837890625, "learning_rate": 1e-06, "loss": 0.1326, "num_tokens": 611581346.0, "reward": 0.7715161442756653, "reward_std": 0.2120749056339264, "rewards/progression_diversity/mean": -0.04077008739113808, "rewards/progression_diversity/std": 0.13221383094787598, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.8992512822151184, "rewards/symbolic_reward_partial_score/std": 0.27642568945884705, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0262513160705566, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 404.0, "sampling/sampling_logp_difference/mean": 5.402329444885254, "step": 1869 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.27585209906101227, "epoch": 4.921052631578947, "grad_norm": 338.9614562988281, "learning_rate": 1e-06, "loss": 0.1617, "step": 1870 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.26490674912929535, "epoch": 4.923684210526316, "grad_norm": 0.07172678411006927, "learning_rate": 1e-06, "loss": 0.1314, "step": 1871 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2522326856851578, "epoch": 4.926315789473684, "grad_norm": 0.006790068931877613, "learning_rate": 1e-06, "loss": 0.2023, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3835.0, "completions/mean_length": 1870.408203125, "completions/mean_terminated_length": 870.5156860351562, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.29680676758289337, "epoch": 4.928947368421053, "frac_reward_zero_std": 0.4375, "grad_norm": 459.56561279296875, "learning_rate": 1e-06, "loss": 0.0501, "num_tokens": 612917779.0, "reward": 0.8070130944252014, "reward_std": 0.12357471138238907, "rewards/progression_diversity/mean": -0.02623012289404869, "rewards/progression_diversity/std": 0.10527035593986511, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.9239909052848816, "rewards/symbolic_reward_partial_score/std": 0.24311135709285736, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0434627532958984, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 408.0, "sampling/sampling_logp_difference/mean": 3.528529167175293, "step": 1873 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.28874650597572327, "epoch": 4.931578947368421, "grad_norm": 229.14480590820312, "learning_rate": 1e-06, "loss": 0.1112, "step": 1874 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.28308890759944916, "epoch": 4.934210526315789, "grad_norm": 0.023203283548355103, "learning_rate": 1e-06, "loss": 0.1112, "step": 1875 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2699621021747589, "epoch": 4.936842105263158, "grad_norm": 0.0074362908490002155, "learning_rate": 1e-06, "loss": 0.1392, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3640.0, "completions/mean_length": 2017.177734375, "completions/mean_terminated_length": 963.0083618164062, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.2770151048898697, "epoch": 4.939473684210526, "frac_reward_zero_std": 0.4375, "grad_norm": 661.4608154296875, "learning_rate": 1e-06, "loss": 0.1113, "num_tokens": 614368398.0, "reward": 0.8179945945739746, "reward_std": 0.1273220181465149, "rewards/progression_diversity/mean": -0.03159845247864723, "rewards/progression_diversity/std": 0.12263655662536621, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9444986581802368, "rewards/symbolic_reward_partial_score/std": 0.2035871148109436, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0411176681518555, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 408.0, "sampling/sampling_logp_difference/mean": 3.9134116172790527, "step": 1877 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2913369834423065, "epoch": 4.942105263157895, "grad_norm": 0.01320668775588274, "learning_rate": 1e-06, "loss": 0.0404, "step": 1878 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.28409533202648163, "epoch": 4.9447368421052635, "grad_norm": 0.03402348980307579, "learning_rate": 1e-06, "loss": 0.1155, "step": 1879 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2711239904165268, "epoch": 4.947368421052632, "grad_norm": 0.008342397399246693, "learning_rate": 1e-06, "loss": 0.1331, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3740.0, "completions/mean_length": 1834.041015625, "completions/mean_terminated_length": 960.4410400390625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.2792969048023224, "epoch": 4.95, "frac_reward_zero_std": 0.3125, "grad_norm": 335.7244873046875, "learning_rate": 1e-06, "loss": 0.125, "num_tokens": 615720259.0, "reward": 0.842618465423584, "reward_std": 0.1381361484527588, "rewards/progression_diversity/mean": -0.02526368759572506, "rewards/progression_diversity/std": 0.10902338474988937, "rewards/symbolic_reward_accuracy/mean": 0.93359375, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.9593098759651184, "rewards/symbolic_reward_partial_score/std": 0.18112175166606903, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0347847938537598, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 410.0, "sampling/sampling_logp_difference/mean": 4.509306907653809, "step": 1881 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2655886858701706, "epoch": 4.9526315789473685, "grad_norm": 0.01917690970003605, "learning_rate": 1e-06, "loss": 0.1565, "step": 1882 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2937444746494293, "epoch": 4.955263157894737, "grad_norm": 0.027145760133862495, "learning_rate": 1e-06, "loss": 0.0847, "step": 1883 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2872210890054703, "epoch": 4.957894736842105, "grad_norm": 0.007199451327323914, "learning_rate": 1e-06, "loss": 0.1322, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4932.0, "completions/mean_length": 2128.708984375, "completions/mean_terminated_length": 953.3255615234375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.2776035964488983, "epoch": 4.9605263157894735, "frac_reward_zero_std": 0.46875, "grad_norm": 354.8702392578125, "learning_rate": 1e-06, "loss": 0.0456, "num_tokens": 617215790.0, "reward": 0.7808091640472412, "reward_std": 0.14809326827526093, "rewards/progression_diversity/mean": -0.029436133801937103, "rewards/progression_diversity/std": 0.1100505068898201, "rewards/symbolic_reward_accuracy/mean": 0.861328125, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.9044596552848816, "rewards/symbolic_reward_partial_score/std": 0.26103639602661133, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.038970708847046, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 410.0, "sampling/sampling_logp_difference/mean": 4.3476243019104, "step": 1885 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2802853584289551, "epoch": 4.963157894736842, "grad_norm": 0.026210030540823936, "learning_rate": 1e-06, "loss": 0.1114, "step": 1886 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.292650431394577, "epoch": 4.965789473684211, "grad_norm": 0.017857909202575684, "learning_rate": 1e-06, "loss": 0.0637, "step": 1887 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.258012056350708, "epoch": 4.968421052631579, "grad_norm": 0.031659141182899475, "learning_rate": 1e-06, "loss": 0.1985, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1877.359375, "completions/mean_terminated_length": 1069.773193359375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.28717607259750366, "epoch": 4.971052631578948, "frac_reward_zero_std": 0.34375, "grad_norm": 143.55223083496094, "learning_rate": 1e-06, "loss": 0.0625, "num_tokens": 618611846.0, "reward": 0.8242976665496826, "reward_std": 0.13553351163864136, "rewards/progression_diversity/mean": -0.021410608664155006, "rewards/progression_diversity/std": 0.09337057173252106, "rewards/symbolic_reward_accuracy/mean": 0.90625, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.9527994990348816, "rewards/symbolic_reward_partial_score/std": 0.17747651040554047, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0396955013275146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 412.0, "sampling/sampling_logp_difference/mean": 4.124094009399414, "step": 1889 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2870943546295166, "epoch": 4.973684210526316, "grad_norm": 0.014890086837112904, "learning_rate": 1e-06, "loss": 0.0991, "step": 1890 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.27629394829273224, "epoch": 4.976315789473684, "grad_norm": 0.010494343005120754, "learning_rate": 1e-06, "loss": 0.0717, "step": 1891 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.26271922141313553, "epoch": 4.978947368421053, "grad_norm": 0.014826311729848385, "learning_rate": 1e-06, "loss": 0.1739, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 1878.482421875, "completions/mean_terminated_length": 879.1461791992188, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.30142028629779816, "epoch": 4.981578947368421, "frac_reward_zero_std": 0.4375, "grad_norm": 139.75131225585938, "learning_rate": 1e-06, "loss": 0.0789, "num_tokens": 619963101.0, "reward": 0.8057065010070801, "reward_std": 0.13081881403923035, "rewards/progression_diversity/mean": -0.025057487189769745, "rewards/progression_diversity/std": 0.10004720836877823, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9339193105697632, "rewards/symbolic_reward_partial_score/std": 0.21596600115299225, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0438662767410278, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 412.0, "sampling/sampling_logp_difference/mean": 4.115793228149414, "step": 1893 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.28368599712848663, "epoch": 4.984210526315789, "grad_norm": 0.008541757240891457, "learning_rate": 1e-06, "loss": 0.1416, "step": 1894 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2943654954433441, "epoch": 4.9868421052631575, "grad_norm": 0.007597525604069233, "learning_rate": 1e-06, "loss": 0.0159, "step": 1895 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.28492121398448944, "epoch": 4.989473684210527, "grad_norm": 0.008518518880009651, "learning_rate": 1e-06, "loss": 0.0887, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3774.0, "completions/mean_length": 1736.919921875, "completions/mean_terminated_length": 889.568115234375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.30898527801036835, "epoch": 4.992105263157895, "frac_reward_zero_std": 0.53125, "grad_norm": 330.7555847167969, "learning_rate": 1e-06, "loss": 0.0491, "num_tokens": 621237364.0, "reward": 0.8030959963798523, "reward_std": 0.13730832934379578, "rewards/progression_diversity/mean": -0.01755291409790516, "rewards/progression_diversity/std": 0.07998932898044586, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9288737177848816, "rewards/symbolic_reward_partial_score/std": 0.22742776572704315, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0468072891235352, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 412.0, "sampling/sampling_logp_difference/mean": 3.8619487285614014, "step": 1897 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.29312750697135925, "epoch": 4.994736842105263, "grad_norm": 0.026777606457471848, "learning_rate": 1e-06, "loss": 0.1218, "step": 1898 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.30650024116039276, "epoch": 4.997368421052632, "grad_norm": 0.006984531879425049, "learning_rate": 1e-06, "loss": 0.0475, "step": 1899 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.295014426112175, "epoch": 5.0, "grad_norm": 0.007008237764239311, "learning_rate": 1e-06, "loss": 0.1042, "step": 1900 }, { "epoch": 5.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.015380859375, "eval_completions/max_length": 13712.375, "eval_completions/max_terminated_length": 2437.9375, "eval_completions/mean_length": 905.335693359375, "eval_completions/mean_terminated_length": 664.4008159637451, "eval_completions/min_length": 251.21875, "eval_completions/min_terminated_length": 251.21875, "eval_entropy": 0.31546804029494524, "eval_frac_reward_zero_std": 0.59765625, "eval_loss": 0.018208162859082222, "eval_num_tokens": 621237364.0, "eval_reward": 0.8710558321326971, "eval_reward_std": 0.08327648929116549, "eval_rewards/progression_diversity/mean": -0.00623641712081735, "eval_rewards/progression_diversity/std": 0.04451105743646622, "eval_rewards/symbolic_reward_accuracy/mean": 0.964111328125, "eval_rewards/symbolic_reward_accuracy/std": 0.1680670971982181, "eval_rewards/symbolic_reward_partial_score/mean": 0.9798990897834301, "eval_rewards/symbolic_reward_partial_score/std": 0.10754719079704955, "eval_rewards/tag_count_reward/mean": -0.01318359375, "eval_rewards/tag_count_reward/std": 0.09598715417087078, "eval_runtime": 3296.8572, "eval_samples_per_second": 0.076, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0692463777959347, "eval_sampling/importance_sampling_ratio/min": 7.846449850264574e-05, "eval_sampling/sampling_logp_difference/max": 337.0917568653822, "eval_sampling/sampling_logp_difference/mean": 0.9625979592092335, "eval_steps_per_second": 0.001, "step": 1900 }, { "epoch": 5.0, "step": 1900, "total_flos": 0.0, "train_loss": 0.10312542560923918, "train_runtime": 114002.2359, "train_samples_per_second": 0.134, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 1900, "num_input_tokens_seen": 621237364, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }