Spaces:

MSGEncrypted
/

lesson-agent-dev

Sleeping

MSG

Feat/monday sprint 2 (#19)

727cb75 18 days ago

612 Bytes

	# Code generation profile — HumanEval + MBPP, plus two general-capability
	# guard tasks (hellaswag, piqa) listed under `tasks` below.
	# Run: slm-lm-eval --profile code --preset minicpm5-1b-lesson-lora
	# Note: small models often score low; use --limit 25 for smoke tests.

	# Profile name; same value as the `--profile code` argument in the run
	# command above.
	profile: code
	# NOTE(review): presumably the human-readable claim this profile's results
	# are meant to support — confirm how the consumer uses this field.
	claim: Better code generation

	# Benchmarks run by this profile: the two code tasks first, then the guards.
	tasks:
	- humaneval
	- mbpp
	- hellaswag # general-capability guard (catch regression from skill tuning)
	- piqa # general-capability guard

	# humaneval/mbpp execute model-generated code; opt in explicitly.
	confirm_run_unsafe_code: true
	# Zero few-shot examples are requested for every task in this profile.
	num_fewshot: 0
	# NOTE(review): presumably a cap on examples per task — confirm. The header
	# note suggests overriding with --limit 25 for smoke tests.
	limit: 50
	# Fixed seed so repeated runs use the same value.
	seed: 42
	batch_size: auto
	device: auto
	dtype: bfloat16
	# NOTE(review): presumably lets the loader run custom code shipped with the
	# model repository — confirm the preset's model source is trusted.
	trust_remote_code: true
	output_dir: results/lm_eval