# Source: lesson-agent-dev/research/evals/configs/lm_eval_code.yaml
# Last commit: Feat/monday sprint 2 (#19) (727cb75)
---
# Profile: code generation (HumanEval + MBPP).
#
# Usage:
#   slm-lm-eval --profile code --preset minicpm5-1b-lesson-lora
#
# Small models often score low on these tasks; pass --limit 25 for a quick
# smoke test.
profile: 'code'
claim: 'Better code generation'

tasks:
  - humaneval
  - mbpp
  # General-capability guards: catch regressions introduced by skill tuning.
  - hellaswag
  - piqa

# humaneval and mbpp execute model-generated code, so running them must be
# opted into explicitly.
confirm_run_unsafe_code: true

num_fewshot: 0
limit: 50
seed: 42

batch_size: 'auto'
device: 'auto'
dtype: 'bfloat16'
trust_remote_code: true

output_dir: 'results/lm_eval'