Spaces:
Sleeping
Sleeping
| # Code generation profile — HumanEval + MBPP | |
| # Run: slm-lm-eval --profile code --preset minicpm5-1b-lesson-lora | |
| # Note: small models often score low on these tasks; pass --limit 25 for a quick smoke test (this profile sets limit: 50). | |
| profile: code | |
| claim: Better code generation | |
| tasks: | |
| - humaneval | |
| - mbpp | |
| - hellaswag # general-capability guard (catches regressions from skill tuning) | |
| - piqa # general-capability guard | |
| # humaneval and mbpp execute model-generated code, so running them requires an explicit opt-in. | |
| confirm_run_unsafe_code: true | |
| num_fewshot: 0 | |
| limit: 50 | |
| seed: 42 | |
| batch_size: auto | |
| device: auto | |
| dtype: bfloat16 | |
| trust_remote_code: true | |
| output_dir: results/lm_eval | |