[ { "iteration": 0, "condition": "A", "eval/pass@1": 0.7963414634146343, "eval/pass@5": 0.8536585365853658, "eval_mbpp/pass@1": 0.6339999999999999, "eval_mbpp/pass@5": 0.68 }, { "iteration": 1, "condition": "A", "time_min": 793.5286037087441, "train/pass_rate": 0.6029668411867365, "train/mean_reward": 0.630802792321117, "train/reward_std": 0.4778792950119141, "train/informative_groups": 132, "train/num_groups": 200, "train/num_rollouts": 1146, "train/mean_group_size": 5.73, "train/error_counts": { "none": 691, "wrong_answer": 335, "runtime": 112, "syntax": 6, "timeout": 2 }, "train/mean_test_pass_frac": 0.6421465968586387, "grpo/loss": -6.980106434566063e-05, "grpo/policy_loss": -7.18922148014875e-05, "grpo/kl": 0.0002088015155534978, "grpo/n_seq": 596, "grpo/n_tokens": 108403, "grpo/mean_abs_adv": 0.8494695751934044, "eval/pass@1": 0.7975609756097561, "eval/pass@5": 0.8597560975609756, "eval_mbpp/pass@1": 0.624, "eval_mbpp/pass@5": 0.69 }, { "iteration": 2, "condition": "A", "time_min": 813.6403118530909, "train/pass_rate": 0.6399317406143344, "train/mean_reward": 0.6740045506257111, "train/reward_std": 0.45920328522941584, "train/informative_groups": 124, "train/num_groups": 200, "train/num_rollouts": 1172, "train/mean_group_size": 5.86, "train/error_counts": { "none": 750, "runtime": 99, "wrong_answer": 315, "syntax": 3, "timeout": 5 }, "train/mean_test_pass_frac": 0.6842434584755405, "grpo/loss": -0.00011221848882273987, "grpo/policy_loss": -0.00011718302083088311, "grpo/kl": 0.0004965064841258027, "grpo/n_seq": 560, "grpo/n_tokens": 99371, "grpo/mean_abs_adv": 0.8516633146460711, "eval/pass@1": 0.7987804878048781, "eval/pass@5": 0.8475609756097561, "eval_mbpp/pass@1": 0.632, "eval_mbpp/pass@5": 0.71 }, { "iteration": 3, "condition": "A", "time_min": 787.4754820307096, "train/pass_rate": 0.6385135135135135, "train/mean_reward": 0.6712978603603603, "train/reward_std": 0.4618140711956283, "train/informative_groups": 123, "train/num_groups": 200, "train/num_rollouts": 1184, "train/mean_group_size": 5.92, "train/error_counts": { "none": 756, "wrong_answer": 317, "runtime": 107, "timeout": 2, "syntax": 2 }, "train/mean_test_pass_frac": 0.681179617117117, "grpo/loss": -2.1914203446325122e-05, "grpo/policy_loss": -3.2540650668425725e-05, "grpo/kl": 0.001062764957962002, "grpo/n_seq": 559, "grpo/n_tokens": 100376, "grpo/mean_abs_adv": 0.8552671855178949, "eval/pass@1": 0.8048780487804879, "eval/pass@5": 0.8536585365853658, "eval_mbpp/pass@1": 0.636, "eval_mbpp/pass@5": 0.69 } ]