spark-code-A-3b / metrics.json
amarsaikhan's picture
Initial capstone artifact upload
3ec8e56 verified
[
{
"iteration": 0,
"condition": "A",
"eval/pass@1": 0.7963414634146343,
"eval/pass@5": 0.8536585365853658,
"eval_mbpp/pass@1": 0.6339999999999999,
"eval_mbpp/pass@5": 0.68
},
{
"iteration": 1,
"condition": "A",
"time_min": 793.5286037087441,
"train/pass_rate": 0.6029668411867365,
"train/mean_reward": 0.630802792321117,
"train/reward_std": 0.4778792950119141,
"train/informative_groups": 132,
"train/num_groups": 200,
"train/num_rollouts": 1146,
"train/mean_group_size": 5.73,
"train/error_counts": {
"none": 691,
"wrong_answer": 335,
"runtime": 112,
"syntax": 6,
"timeout": 2
},
"train/mean_test_pass_frac": 0.6421465968586387,
"grpo/loss": -6.980106434566063e-05,
"grpo/policy_loss": -7.18922148014875e-05,
"grpo/kl": 0.0002088015155534978,
"grpo/n_seq": 596,
"grpo/n_tokens": 108403,
"grpo/mean_abs_adv": 0.8494695751934044,
"eval/pass@1": 0.7975609756097561,
"eval/pass@5": 0.8597560975609756,
"eval_mbpp/pass@1": 0.624,
"eval_mbpp/pass@5": 0.69
},
{
"iteration": 2,
"condition": "A",
"time_min": 813.6403118530909,
"train/pass_rate": 0.6399317406143344,
"train/mean_reward": 0.6740045506257111,
"train/reward_std": 0.45920328522941584,
"train/informative_groups": 124,
"train/num_groups": 200,
"train/num_rollouts": 1172,
"train/mean_group_size": 5.86,
"train/error_counts": {
"none": 750,
"runtime": 99,
"wrong_answer": 315,
"syntax": 3,
"timeout": 5
},
"train/mean_test_pass_frac": 0.6842434584755405,
"grpo/loss": -0.00011221848882273987,
"grpo/policy_loss": -0.00011718302083088311,
"grpo/kl": 0.0004965064841258027,
"grpo/n_seq": 560,
"grpo/n_tokens": 99371,
"grpo/mean_abs_adv": 0.8516633146460711,
"eval/pass@1": 0.7987804878048781,
"eval/pass@5": 0.8475609756097561,
"eval_mbpp/pass@1": 0.632,
"eval_mbpp/pass@5": 0.71
},
{
"iteration": 3,
"condition": "A",
"time_min": 787.4754820307096,
"train/pass_rate": 0.6385135135135135,
"train/mean_reward": 0.6712978603603603,
"train/reward_std": 0.4618140711956283,
"train/informative_groups": 123,
"train/num_groups": 200,
"train/num_rollouts": 1184,
"train/mean_group_size": 5.92,
"train/error_counts": {
"none": 756,
"wrong_answer": 317,
"runtime": 107,
"timeout": 2,
"syntax": 2
},
"train/mean_test_pass_frac": 0.681179617117117,
"grpo/loss": -2.1914203446325122e-05,
"grpo/policy_loss": -3.2540650668425725e-05,
"grpo/kl": 0.001062764957962002,
"grpo/n_seq": 559,
"grpo/n_tokens": 100376,
"grpo/mean_abs_adv": 0.8552671855178949,
"eval/pass@1": 0.8048780487804879,
"eval/pass@5": 0.8536585365853658,
"eval_mbpp/pass@1": 0.636,
"eval_mbpp/pass@5": 0.69
}
]