---
# Code generation profile — HumanEval + MBPP
# Run: slm-lm-eval --profile code --preset minicpm5-1b-lesson-lora
# Note: small models often score low; use --limit 25 for smoke tests.

# Profile identifier; matches the value passed to --profile in the run line.
profile: code
# Short statement of what this profile is meant to demonstrate.
claim: Better code generation

# Benchmarks evaluated by this profile.
tasks:
  - humaneval
  - mbpp
  - hellaswag  # general-capability guard (catch regression from skill tuning)
  - piqa  # general-capability guard

# humaneval/mbpp execute model-generated code; opt in explicitly.
confirm_run_unsafe_code: true

# Evaluation settings.
num_fewshot: 0
# Default sample cap; the header note suggests --limit 25 for smoke tests.
limit: 50
seed: 42

# Runtime settings.
batch_size: auto
device: auto
dtype: bfloat16
# NOTE(review): presumably allows the model loader to run code shipped with
# the checkpoint — confirm this is intended for every preset.
trust_remote_code: true

# Relative path; NOTE(review): base directory is not shown here — confirm
# against the runner.
output_dir: results/lm_eval