Spaces:

MSGEncrypted
/

lesson-agent-dev

Sleeping

lesson-agent-dev/research/modal/experiments.yaml

MSG

Feat/last hour (#24)

bbff1ca 18 days ago

16.2 kB

	# Skill matrix for the Modal finetune + lm-eval + publish pipeline.
	#
	# Each entry trains one QLoRA adapter for a skill/category, evaluates it
	# against the matching slm-lm-eval profile (vs. a per-profile baseline),
	# checks the result against `goals`, and — only if the gate passes —
	# publishes the adapter to `publish.hub_repo` on the Hugging Face Hub.
	#
	# Baselines are always the unfine-tuned base model (`defaults.preset` →
	# models.yaml model_id, e.g. openbmb/MiniCPM5-1B), stored as
	# results/lm_eval/<preset>__baseline__<profile>/ and reused across jobs.
	#
	# Smoke limits (max_steps, max_samples, eval `limit` in the profile configs)
	# keep hackathon runs affordable; bump them for full runs.
	#
	# publish.private is `false` so passing adapters land on the Hub publicly: the
	# Well-Tuned badge requires a judge-visible, fine-tuned published model.
	#
	# Workflows (see modal/README.md):
	# modal run research/modal/finetune_app.py # full sweep: baselines -> train -> eval -> gate -> publish -> pull
	# modal run research/modal/finetune_app.py --job math-lora # one skill
	# modal run research/modal/finetune_app.py --category math # one category
	# modal run research/modal/finetune_app.py --eval-only --job math-lora
	# modal run research/modal/finetune_app.py --no-publish # train+eval, skip Hub push
	# modal run research/modal/finetune_app.py::publish_only --job math-lora
	# modal run research/modal/finetune_app.py::pull --category math

	# Values shared by the jobs listed under `finetune`. Per the inline note on
	# `gpu`, a job can override a default by setting the same key itself.
	defaults:
	  preset: minicpm5-1b
	  mode: qlora
	  gpu: A10G  # QLoRA fits on T4 too; override per job with `gpu: T4` for cheaper runs
	  max_steps: 100
	  # Hugging Face namespace for published adapters.
	  hub_org: MSGEncrypted
	  # Second eval pass for publish gates: balanced general SLM mix (limit 100).
	  general_eval_profile: compare_study
	  # Guard list for the general eval pass; every task carries the same 0.03
	  # `max_regress` value.
	  general_goals:
	    guard_tasks:
	    - task: arc_easy
	      max_regress: 0.03
	    - task: arc_challenge
	      max_regress: 0.03
	    - task: hellaswag
	      max_regress: 0.03
	    - task: piqa
	      max_regress: 0.03
	    - task: boolq
	      max_regress: 0.03
	    - task: gsm8k
	      max_regress: 0.03

	# Job list: one entry per skill adapter (train -> eval -> gate -> publish).
	finetune:
	# --- teaching: lesson-planning agent chat data (Well-Tuned primary) ---
	# 8 local lesson chats overfit easily; mix in alpaca replay + NEFTune so
	# IFEval clears without washing out the lesson skill signal.
	- name: teaching-lora
	  category: teaching
	  max_steps: 150
	  # Training mix: up-weighted local lesson chats plus a general replay slice.
	  mix:
	  - dataset: research/data/education-lesson-chat.jsonl
	    format: chat
	    weight: 20  # ~8 samples → ~160 examples
	  - dataset: tatsu-lab/alpaca  # instruction-following replay for ifeval
	    format: alpaca
	    dataset_split: "train[:600]"
	    max_samples: 600
	  # Hyperparameters for this job.
	  args:
	    lora_r: 32
	    lora_alpha: 64
	    neftune_noise_alpha: 5
	    early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
	    val_split: 0.05
	  description: Lesson-planning chat + alpaca replay, r=32 + NEFTune
	  eval_profile: instructions
	  # Publish gate: ifeval score and improvement thresholds.
	  goals:
	    task: ifeval
	    min_score: 0.15
	    min_improve: 0.02
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-teaching-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-teaching-lora
	    private: false

	# --- science: MC-format science Q&A (sciq/ARC/OpenBookQA train) ---
	# Previous attempt used chat-format tutoring — wrong signal for MC benchmarks.
	# Model already scores 0.935 sciq; needs in-distribution MC Q→A to push higher.
	# allenai/sciq train: 11k factual science MC (question→correct_answer).
	# allenai/ai2_arc ARC-Easy train: elementary/science MC, boosts arc_* guards.
	# allenai/openbookqa train: fact-based science Q&A, improves openbookqa eval.
	# Local science-tutor-chat kept at low weight for style/explanation diversity.
	# MetaMathQA slice protects gsm8k guard (prevented 0.14 regression last run).
	# Reduced to r=16, no NEFTune: less catastrophic forgetting on small datasets.
	#
	# NOTE(review): the ARC-Easy and OpenBookQA slices map only the question stem
	# to the prompt and `answerKey` to the response. If the loader does not also
	# render the answer choices into the prompt, the target label has no referent
	# — confirm against research/finetune.py.
	- name: science-lora
	  category: science
	  max_steps: 120
	  mix:
	  - dataset: allenai/sciq  # 11k MC science Q→A (in-distribution with sciq eval)
	    format: prompt
	    columns:
	      prompt: question
	      response: correct_answer
	    dataset_split: "train[:1500]"
	    max_samples: 1500
	  - dataset: allenai/ai2_arc  # ARC-Easy config only: elementary science MC
	    format: prompt
	    dataset_config: ARC-Easy
	    columns:
	      prompt: question
	      response: answerKey
	    dataset_split: "train[:500]"
	    max_samples: 500
	  - dataset: allenai/openbookqa  # fact-based open science Q&A
	    format: prompt
	    columns:
	      prompt: question_stem
	      response: answerKey
	    dataset_split: "train[:400]"
	    max_samples: 400
	  - dataset: research/data/science-tutor-chat.jsonl  # style diversity
	    format: chat
	    weight: 4
	  - dataset: meta-math/MetaMathQA  # gsm8k guard protection
	    format: prompt
	    columns:
	      prompt: query
	      response: response
	    dataset_split: "train[:200]"
	    max_samples: 200
	  - dataset: tatsu-lab/alpaca  # general replay: protect hellaswag/piqa/boolq
	    format: alpaca
	    dataset_split: "train[:400]"
	    max_samples: 400
	  args:
	    lora_r: 16
	    lora_alpha: 32
	    early_stopping_patience: 3
	    val_split: 0.05
	  description: >
	    sciq + ARC-Easy + OpenBookQA MC train + science-chat style + MetaMathQA
	    guard + alpaca replay. r=16, no NEFTune to avoid gsm8k regression.
	  eval_profile: science
	  # Publish gate: sciq thresholds plus an arc_challenge regression guard.
	  goals:
	    task: sciq
	    min_score: 0.50
	    min_improve: 0.02
	    guard_tasks:
	    - task: arc_challenge
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-science-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-science-lora
	    private: false

	# --- math: GSM8K/MATH natural-language CoT augmentation (MetaMathQA) ---
	# MetaMathQA's NL chain-of-thought matches GSM8K's 5-shot format far better
	# than MathInstruct's program-of-thought answers (which regressed gsm8k).
	# The `mix:` adds a general-data replay slice (alpaca) so skill tuning does
	# not regress the arc/hellaswag/piqa guards. `args:` flows any
	# research/finetune.py hyperparameter straight through.
	- name: math-lora
	  category: math
	  max_steps: 150
	  mix:
	  - dataset: meta-math/MetaMathQA
	    format: prompt
	    columns:
	      prompt: query
	      response: response
	    dataset_split: "train[:3000]"
	    max_samples: 3000
	  - dataset: tatsu-lab/alpaca  # general replay: protect guard tasks
	    format: alpaca
	    dataset_split: "train[:600]"
	    max_samples: 600
	  # No early stopping / val_split here, unlike most other jobs in this file.
	  args:
	    lora_r: 32
	    lora_alpha: 64
	    neftune_noise_alpha: 5
	  description: GSM8K/MATH NL-CoT (MetaMathQA) + alpaca replay, r=32 + NEFTune
	  eval_profile: math
	  # Publish gate: gsm8k thresholds plus three general-ability guards.
	  goals:
	    task: gsm8k
	    min_score: 0.05
	    min_improve: 0.02
	    guard_tasks:
	    - task: arc_challenge
	      max_regress: 0.03
	    - task: hellaswag
	      max_regress: 0.03
	    - task: piqa
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-math-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-math-lora
	    private: false

	# --- coding: Python instruction-following code generation ---
	# Single-dataset job (no `mix:`); the dataset keys sit directly on the entry.
	# No `max_steps` is set here — presumably `defaults.max_steps` applies; confirm.
	- name: coding-lora
	  category: coding
	  dataset: iamtarun/python_code_instructions_18k_alpaca
	  format: alpaca
	  dataset_split: "train[:1000]"
	  max_samples: 1000
	  args:
	    early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
	    val_split: 0.05
	  description: Python code instruction tuning (Hub, alpaca columns)
	  eval_profile: code
	  # Publish gate: mbpp thresholds plus hellaswag/piqa regression guards.
	  goals:
	    task: mbpp
	    min_score: 0.05
	    min_improve: 0.01
	    guard_tasks:
	    - task: hellaswag
	      max_regress: 0.03
	    - task: piqa
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-coding-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-coding-lora
	    private: false

	# --- reasoning: multi-turn chat with reasoning-heavy conversations ---
	# Single-dataset job (no `mix:`); the dataset keys sit directly on the entry.
	# No `max_steps` is set here — presumably `defaults.max_steps` applies; confirm.
	- name: reasoning-lora
	  category: reasoning
	  dataset: HuggingFaceTB/smoltalk
	  format: chat
	  dataset_config: all
	  dataset_split: "train[:500]"
	  max_samples: 500
	  args:
	    early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
	    val_split: 0.05
	  description: Multi-turn reasoning/chat subset (Hub)
	  eval_profile: reasoning
	  # Publish gate: gsm8k thresholds plus a hellaswag regression guard.
	  goals:
	    task: gsm8k
	    min_score: 0.05
	    min_improve: 0.01
	    guard_tasks:
	    - task: hellaswag
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-reasoning-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-reasoning-lora
	    private: false

	# --- medical: clinical Q&A (MedQA/Meadow) + alpaca replay ---
	# New vertical. Same overfit-guard recipe as teaching/science: a focused
	# skill dataset up-weighted, alpaca replay + NEFTune + r=32 so PubMedQA/MedMCQA
	# improve without regressing the arc_challenge general-knowledge guard.
	- name: medical-lora
	  category: medical
	  max_steps: 200
	  mix:
	  - dataset: medalpaca/medical_meadow_medqa  # USMLE-style QA, alpaca columns
	    format: alpaca
	    dataset_split: "train[:2000]"
	    max_samples: 2000
	  - dataset: tatsu-lab/alpaca  # general replay: protect guards
	    format: alpaca
	    dataset_split: "train[:600]"
	    max_samples: 600
	  args:
	    lora_r: 32
	    lora_alpha: 64
	    neftune_noise_alpha: 5
	    early_stopping_patience: 2
	    val_split: 0.05
	  description: Medical QA (medalpaca Meadow MedQA) + alpaca replay, r=32 + NEFTune
	  eval_profile: medical
	  # Publish gate: pubmedqa thresholds plus an arc_challenge regression guard.
	  goals:
	    task: pubmedqa
	    min_score: 0.45
	    min_improve: 0.02
	    guard_tasks:
	    - task: arc_challenge
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-medical-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-medical-lora
	    private: false

	# --- tool-use: function/tool-calling (xLAM) ---
	# New vertical that closes the loop with the existing BFCL agentic benchmark.
	# The publish gate guards general ability (lm-eval has no function-call task);
	# the skill metric is the BFCL/tau-bench suite run via slm-benchmark:
	# uv run --package slm-evals slm-benchmark --model <adapter> --benchmarks bfcl --max-samples 50
	- name: tool-use-lora
	  category: tool_use
	  max_steps: 200
	  mix:
	  - dataset: Salesforce/xlam-function-calling-60k
	    format: prompt
	    columns:
	      prompt: query
	      response: answers  # JSON function-call(s) the model must emit
	    dataset_split: "train[:3000]"
	    max_samples: 3000
	  - dataset: tatsu-lab/alpaca  # general replay: protect guards
	    format: alpaca
	    dataset_split: "train[:600]"
	    max_samples: 600
	  args:
	    lora_r: 32
	    lora_alpha: 64
	    neftune_noise_alpha: 5
	    early_stopping_patience: 2
	    val_split: 0.05
	  description: >
	    Function/tool-calling (Salesforce xLAM) + alpaca replay. Skill metric is the
	    BFCL agentic suite (slm-benchmark); lm-eval gate only guards general ability.
	  eval_profile: compare_study
	  # Gate uses arc_easy with `min_improve: 0.0` and no `min_score`, in line with
	  # the note above that the lm-eval gate only guards general ability.
	  goals:
	    task: arc_easy
	    min_improve: 0.0
	    guard_tasks:
	    - task: hellaswag
	      max_regress: 0.03
	    - task: piqa
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-tool-use-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-tool-use-lora
	    private: false

	# --- commonsense: everyday-reasoning MCQ (CommonsenseQA train) ---
	# New vertical. In-distribution MC train (question -> answerKey), same recipe
	# as science-lora's ARC/OpenBookQA slices, with alpaca replay and
	# piqa/hellaswag regression guards.
	#
	# NOTE(review): only `question` is mapped to the prompt while the response is
	# `answerKey`. If the loader does not also render the answer choices into the
	# prompt, the target label has no referent — confirm against
	# research/finetune.py.
	- name: commonsense-lora
	  category: commonsense
	  max_steps: 150
	  mix:
	  - dataset: tau/commonsense_qa  # 5-way everyday-knowledge MCQ, in-distribution
	    format: prompt
	    columns:
	      prompt: question
	      response: answerKey
	    dataset_split: "train[:2000]"
	    max_samples: 2000
	  - dataset: tatsu-lab/alpaca  # general replay: protect piqa/hellaswag guards
	    format: alpaca
	    dataset_split: "train[:600]"
	    max_samples: 600
	  args:
	    lora_r: 16
	    lora_alpha: 32
	    early_stopping_patience: 2
	    val_split: 0.05
	  description: CommonsenseQA MC train + alpaca replay, r=16
	  eval_profile: commonsense
	  # Publish gate: commonsense_qa thresholds plus piqa/hellaswag guards.
	  goals:
	    task: commonsense_qa
	    min_score: 0.30
	    min_improve: 0.02
	    guard_tasks:
	    - task: piqa
	      max_regress: 0.03
	    - task: hellaswag
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-commonsense-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-commonsense-lora
	    private: false

	# --- general instructions baseline: no goals/publish -> local-only adapter ---
	# `goals` and `publish` are intentionally absent, so there is no gate and no
	# Hub repo for this entry. Unlike the other jobs, `dataset_split` is the whole
	# `train` split rather than a slice; `max_samples` is the only size limit.
	- name: alpaca-lora
	  category: instructions
	  dataset: tatsu-lab/alpaca
	  format: alpaca
	  dataset_split: train
	  max_samples: 200
	  description: General instruction tuning baseline (Hub, local-only)
	  eval_profile: instructions

	# --- language lessons: FR/AR TeacherVoice coach (Cohere-free stack) ---
	# French and Arabic lesson chats carry equal weight (12 each); the science
	# tutor chat and an English alpaca slice are mixed in at lower weight.
	- name: language-lesson-lora
	  category: language
	  max_steps: 200
	  mix:
	  - dataset: research/data/language-lesson-fr.jsonl
	    format: chat
	    weight: 12
	  - dataset: research/data/language-lesson-ar.jsonl
	    format: chat
	    weight: 12
	  - dataset: research/data/science-tutor-chat.jsonl
	    format: chat
	    weight: 4
	  - dataset: tatsu-lab/alpaca
	    format: alpaca
	    dataset_split: "train[:400]"
	    max_samples: 400
	    weight: 1
	  args:
	    lora_r: 32
	    lora_alpha: 64
	    neftune_noise_alpha: 5
	    early_stopping_patience: 2
	    val_split: 0.05
	  description: >
	    FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
	    build_language_lesson_chat.py) + English replay
	  eval_profile: multilingual
	  # Gate uses xnli with `min_improve: 0.0` and no `min_score`, plus a
	  # hellaswag regression guard.
	  goals:
	    task: xnli
	    min_improve: 0.0
	    guard_tasks:
	    - task: hellaswag
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-language-lesson-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-language-lesson-lora
	    private: false

	# --- french: EN→FR translation (FrancophonIA/english_french) + FrenchBench gate ---
	# 320k parallel sentences from Kaggle englishfrench-fornmt (Hub: FrancophonIA/english_french).
	# FrenchBench (CroissantLLM) is the official French lm-eval suite; gate on french_bench_xnli.
	- name: french-lora
	  category: french
	  max_steps: 150
	  mix:
	  - dataset: FrancophonIA/english_french
	    format: prompt
	    columns:
	      prompt: english
	      response: french
	    # NOTE(review): the nesting level of `prompt_prefix` could not be recovered
	    # from the flattened source; it is placed beside `columns` because it is a
	    # literal string rather than a column name. Confirm against
	    # research/finetune.py (it may belong under `columns`).
	    # Double quotes are required so `\n` is read as a newline.
	    prompt_prefix: "Translate the following English sentence to French:\n"
	    dataset_split: "train[:3000]"
	    max_samples: 3000
	  - dataset: research/data/language-lesson-fr.jsonl
	    format: chat
	    weight: 6
	  - dataset: tatsu-lab/alpaca
	    format: alpaca
	    dataset_split: "train[:400]"
	    max_samples: 400
	  args:
	    lora_r: 32
	    lora_alpha: 64
	    neftune_noise_alpha: 5
	    early_stopping_patience: 2
	    val_split: 0.05
	  description: >
	    EN→FR translation (FrancophonIA/english_french) + TeacherVoice FR chat +
	    alpaca replay. Evaluated on FrenchBench (french_bench_xnli, belebele_fra_Latn).
	  eval_profile: french
	  # Publish gate: french_bench_xnli improvement plus a hellaswag guard.
	  goals:
	    task: french_bench_xnli
	    min_improve: 0.01
	    guard_tasks:
	    - task: hellaswag
	      max_regress: 0.03
	  publish:
	    hub_repo: MSGEncrypted/minicpm5-1b-french-lora
	    mirror_repos:
	    - build-small-hackathon/minicpm5-1b-french-lora
	    private: false