# research/modal/experiments.yaml
# Skill matrix for the Modal finetune + lm-eval + publish pipeline.
#
# Each entry trains one QLoRA adapter for a skill/category, evaluates it
# against the matching slm-lm-eval profile (vs. a per-profile baseline),
# checks the result against `goals`, and — only if the gate passes —
# publishes the adapter to `publish.hub_repo` on the Hugging Face Hub.
#
# Baselines are always the unfine-tuned base model (`defaults.preset` →
# models.yaml model_id, e.g. openbmb/MiniCPM5-1B), stored as
# results/lm_eval/<preset>__baseline__<profile>/ and reused across jobs.
#
# Smoke limits (max_steps, max_samples, eval `limit` in the profile configs)
# keep hackathon runs affordable; bump them for full runs.
#
# publish.private is `false` so passing adapters land on the Hub publicly: the
# Well-Tuned badge requires a judge-visible, fine-tuned published model.
#
# Workflows (see modal/README.md):
#   modal run research/modal/finetune_app.py                        # full sweep: baselines -> train -> eval -> gate -> publish -> pull
#   modal run research/modal/finetune_app.py --job math-lora        # one skill
#   modal run research/modal/finetune_app.py --category math        # one category
#   modal run research/modal/finetune_app.py --eval-only --job math-lora
#   modal run research/modal/finetune_app.py --no-publish           # train+eval, skip Hub push
#   modal run research/modal/finetune_app.py::publish_only --job math-lora
#   modal run research/modal/finetune_app.py::pull --category math
# Values shared by every job; entries under `finetune` may override them
# (e.g. `gpu`, `max_steps`).
# NOTE(review): nesting was reconstructed — confirm that `hub_org` and the
# `general_*` keys belong under `defaults` rather than at the top level.
defaults:
  preset: minicpm5-1b
  mode: qlora
  gpu: A10G  # QLoRA fits on T4 too; override per job with `gpu: T4` for cheaper runs
  max_steps: 100
  # Hugging Face namespace for published adapters.
  hub_org: MSGEncrypted
  # Second eval pass for publish gates: balanced general SLM mix (limit 100).
  general_eval_profile: compare_study
  # Guards for the general pass; `max_regress` is presumably the largest
  # tolerated score drop vs. the baseline — confirm against the gate code.
  general_goals:
    guard_tasks:
    - task: arc_easy
      max_regress: 0.03
    - task: arc_challenge
      max_regress: 0.03
    - task: hellaswag
      max_regress: 0.03
    - task: piqa
      max_regress: 0.03
    - task: boolq
      max_regress: 0.03
    - task: gsm8k
      max_regress: 0.03
finetune:
# --- teaching: lesson-planning agent chat data (Well-Tuned primary) ---
# 8 local lesson chats overfit easily; mix in alpaca replay + NEFTune so
# IFEval clears without washing out the lesson skill signal.
- name: teaching-lora
  category: teaching
  max_steps: 150
  mix:
  - dataset: research/data/education-lesson-chat.jsonl
    format: chat
    weight: 20  # ~8 samples → ~160 examples
  - dataset: tatsu-lab/alpaca  # instruction-following replay for ifeval
    format: alpaca
    dataset_split: "train[:600]"
    max_samples: 600
  args:
    lora_r: 32
    lora_alpha: 64
    neftune_noise_alpha: 5
    early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
    val_split: 0.05
  description: Lesson-planning chat + alpaca replay, r=32 + NEFTune
  eval_profile: instructions
  goals:
    task: ifeval
    min_score: 0.15
    min_improve: 0.02
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-teaching-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-teaching-lora
    private: false
# --- science: MC-format science Q&A (sciq/ARC/OpenBookQA train) ---
# Previous attempt used chat-format tutoring — wrong signal for MC benchmarks.
# Model already scores 0.935 sciq; needs in-distribution MC Q→A to push higher.
# allenai/sciq train: 11k factual science MC (question→correct_answer).
# allenai/ai2_arc ARC-Easy train: elementary/science MC, boosts arc_* guards.
# allenai/openbookqa train: fact-based science Q&A, improves openbookqa eval.
# Local science-tutor-chat kept at low weight for style/explanation diversity.
# MetaMathQA slice protects gsm8k guard (prevented 0.14 regression last run).
# Reduced to r=16, no NEFTune: less catastrophic forgetting on small datasets.
#
# NOTE(review): the ARC and OpenBookQA slices use `answerKey` as the response,
# which looks like a choice label, while the prompt column carries only the
# question text — confirm the `prompt` formatter also shows the answer choices.
- name: science-lora
  category: science
  max_steps: 120
  mix:
  - dataset: allenai/sciq  # 11k MC science Q→A (in-distribution with sciq eval)
    format: prompt
    columns:
      prompt: question
      response: correct_answer
    dataset_split: "train[:1500]"
    max_samples: 1500
  - dataset: allenai/ai2_arc  # elementary science MC (ARC-Easy config only)
    format: prompt
    dataset_config: ARC-Easy
    columns:
      prompt: question
      response: answerKey
    dataset_split: "train[:500]"
    max_samples: 500
  - dataset: allenai/openbookqa  # fact-based open science Q&A
    format: prompt
    columns:
      prompt: question_stem
      response: answerKey
    dataset_split: "train[:400]"
    max_samples: 400
  - dataset: research/data/science-tutor-chat.jsonl  # style diversity
    format: chat
    weight: 4
  - dataset: meta-math/MetaMathQA  # gsm8k guard protection
    format: prompt
    columns:
      prompt: query
      response: response
    dataset_split: "train[:200]"
    max_samples: 200
  - dataset: tatsu-lab/alpaca  # general replay: protect hellaswag/piqa/boolq
    format: alpaca
    dataset_split: "train[:400]"
    max_samples: 400
  args:
    lora_r: 16
    lora_alpha: 32
    early_stopping_patience: 3
    val_split: 0.05
  description: >
    sciq + ARC-Easy + OpenBookQA MC train + science-chat style + MetaMathQA
    guard + alpaca replay. r=16, no NEFTune to avoid gsm8k regression.
  eval_profile: science
  goals:
    task: sciq
    min_score: 0.50
    min_improve: 0.02
    guard_tasks:
    - task: arc_challenge
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-science-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-science-lora
    private: false
# --- math: GSM8K/MATH natural-language CoT augmentation (MetaMathQA) ---
# MetaMathQA's NL chain-of-thought matches GSM8K's 5-shot format far better
# than MathInstruct's program-of-thought answers (which regressed gsm8k).
# The `mix:` adds a general-data replay slice (alpaca) so skill tuning does
# not regress the arc/hellaswag/piqa guards. `args:` flows any
# research/finetune.py hyperparameter straight through.
- name: math-lora
  category: math
  max_steps: 150
  mix:
  - dataset: meta-math/MetaMathQA
    format: prompt
    columns:
      prompt: query
      response: response
    dataset_split: "train[:3000]"
    max_samples: 3000
  - dataset: tatsu-lab/alpaca  # general replay: protect guard tasks
    format: alpaca
    dataset_split: "train[:600]"
    max_samples: 600
  args:
    lora_r: 32
    lora_alpha: 64
    neftune_noise_alpha: 5
  description: GSM8K/MATH NL-CoT (MetaMathQA) + alpaca replay, r=32 + NEFTune
  eval_profile: math
  goals:
    task: gsm8k
    min_score: 0.05
    min_improve: 0.02
    guard_tasks:
    - task: arc_challenge
      max_regress: 0.03
    - task: hellaswag
      max_regress: 0.03
    - task: piqa
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-math-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-math-lora
    private: false
# --- coding: Python instruction-following code generation ---
# Single-dataset job (no `mix:`); no `max_steps` is set on this entry.
- name: coding-lora
  category: coding
  dataset: iamtarun/python_code_instructions_18k_alpaca
  format: alpaca
  dataset_split: "train[:1000]"
  max_samples: 1000
  args:
    early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
    val_split: 0.05
  description: Python code instruction tuning (Hub, alpaca columns)
  eval_profile: code
  goals:
    task: mbpp
    min_score: 0.05
    min_improve: 0.01
    guard_tasks:
    - task: hellaswag
      max_regress: 0.03
    - task: piqa
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-coding-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-coding-lora
    private: false
# --- reasoning: multi-turn chat with reasoning-heavy conversations ---
# Single-dataset job (no `mix:`); gated on gsm8k with a hellaswag guard.
- name: reasoning-lora
  category: reasoning
  dataset: HuggingFaceTB/smoltalk
  format: chat
  dataset_config: all
  dataset_split: "train[:500]"
  max_samples: 500
  args:
    early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
    val_split: 0.05
  description: Multi-turn reasoning/chat subset (Hub)
  eval_profile: reasoning
  goals:
    task: gsm8k
    min_score: 0.05
    min_improve: 0.01
    guard_tasks:
    - task: hellaswag
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-reasoning-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-reasoning-lora
    private: false
# --- medical: clinical Q&A (MedQA/Meadow) + alpaca replay ---
# New vertical. Same overfit-guard recipe as teaching-lora: a focused
# skill dataset up-weighted, alpaca replay + NEFTune + r=32 so PubMedQA/MedMCQA
# improve without regressing the arc_challenge general-knowledge guard.
- name: medical-lora
  category: medical
  max_steps: 200
  mix:
  - dataset: medalpaca/medical_meadow_medqa  # USMLE-style QA, alpaca columns
    format: alpaca
    dataset_split: "train[:2000]"
    max_samples: 2000
  - dataset: tatsu-lab/alpaca  # general replay: protect guards
    format: alpaca
    dataset_split: "train[:600]"
    max_samples: 600
  args:
    lora_r: 32
    lora_alpha: 64
    neftune_noise_alpha: 5
    early_stopping_patience: 2
    val_split: 0.05
  description: Medical QA (medalpaca Meadow MedQA) + alpaca replay, r=32 + NEFTune
  eval_profile: medical
  goals:
    task: pubmedqa
    min_score: 0.45
    min_improve: 0.02
    guard_tasks:
    - task: arc_challenge
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-medical-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-medical-lora
    private: false
# --- tool-use: function/tool-calling (xLAM) ---
# New vertical that closes the loop with the existing BFCL agentic benchmark.
# The publish gate guards general ability (lm-eval has no function-call task);
# the *skill* metric is the BFCL/tau-bench suite run via slm-benchmark:
#   uv run --package slm-evals slm-benchmark --model <adapter> --benchmarks bfcl --max-samples 50
- name: tool-use-lora
  category: tool_use
  max_steps: 200
  mix:
  - dataset: Salesforce/xlam-function-calling-60k
    format: prompt
    columns:
      prompt: query
      response: answers  # JSON function-call(s) the model must emit
    dataset_split: "train[:3000]"
    max_samples: 3000
  - dataset: tatsu-lab/alpaca  # general replay: protect guards
    format: alpaca
    dataset_split: "train[:600]"
    max_samples: 600
  args:
    lora_r: 32
    lora_alpha: 64
    neftune_noise_alpha: 5
    early_stopping_patience: 2
    val_split: 0.05
  description: >
    Function/tool-calling (Salesforce xLAM) + alpaca replay. Skill metric is the
    BFCL agentic suite (slm-benchmark); lm-eval gate only guards general ability.
  eval_profile: compare_study
  goals:
    task: arc_easy
    min_improve: 0.0
    guard_tasks:
    - task: hellaswag
      max_regress: 0.03
    - task: piqa
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-tool-use-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-tool-use-lora
    private: false
# --- commonsense: everyday-reasoning MCQ (CommonsenseQA train) ---
# New vertical. In-distribution MC train (question -> answerKey), same recipe
# as science-lora's ARC/OpenBookQA slices, with alpaca replay and
# piqa/hellaswag guards.
#
# NOTE(review): `answerKey` looks like a choice label, while the prompt column
# carries only the question text — confirm the `prompt` formatter also shows
# the answer choices.
- name: commonsense-lora
  category: commonsense
  max_steps: 150
  mix:
  - dataset: tau/commonsense_qa  # 5-way everyday-knowledge MCQ, in-distribution
    format: prompt
    columns:
      prompt: question
      response: answerKey
    dataset_split: "train[:2000]"
    max_samples: 2000
  - dataset: tatsu-lab/alpaca  # general replay: protect piqa/hellaswag guards
    format: alpaca
    dataset_split: "train[:600]"
    max_samples: 600
  args:
    lora_r: 16
    lora_alpha: 32
    early_stopping_patience: 2
    val_split: 0.05
  description: CommonsenseQA MC train + alpaca replay, r=16
  eval_profile: commonsense
  goals:
    task: commonsense_qa
    min_score: 0.30
    min_improve: 0.02
    guard_tasks:
    - task: piqa
      max_regress: 0.03
    - task: hellaswag
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-commonsense-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-commonsense-lora
    private: false
# --- general instructions baseline: no goals/publish -> local-only adapter ---
# Uses the whole `train` split as the source, capped by `max_samples`.
- name: alpaca-lora
  category: instructions
  dataset: tatsu-lab/alpaca
  format: alpaca
  dataset_split: train
  max_samples: 200
  description: General instruction tuning baseline (Hub, local-only)
  eval_profile: instructions
# --- language lessons: FR/AR TeacherVoice coach (Cohere-free stack) ---
# French and Arabic lesson chats carry equal weight; a small science-tutor
# slice and an English alpaca slice are mixed in as replay.
- name: language-lesson-lora
  category: language
  max_steps: 200
  mix:
  - dataset: research/data/language-lesson-fr.jsonl
    format: chat
    weight: 12
  - dataset: research/data/language-lesson-ar.jsonl
    format: chat
    weight: 12
  - dataset: research/data/science-tutor-chat.jsonl
    format: chat
    weight: 4
  - dataset: tatsu-lab/alpaca
    format: alpaca
    dataset_split: "train[:400]"
    max_samples: 400
    weight: 1
  args:
    lora_r: 32
    lora_alpha: 64
    neftune_noise_alpha: 5
    early_stopping_patience: 2
    val_split: 0.05
  description: >
    FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
    build_language_lesson_chat.py) + English replay
  eval_profile: multilingual
  goals:
    task: xnli
    min_improve: 0.0
    guard_tasks:
    - task: hellaswag
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-language-lesson-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-language-lesson-lora
    private: false
# --- french: EN→FR translation (FrancophonIA/english_french) + FrenchBench gate ---
# 320k parallel sentences from Kaggle englishfrench-fornmt (Hub: FrancophonIA/english_french).
# FrenchBench (CroissantLLM) is the official French lm-eval suite; gate on french_bench_xnli.
- name: french-lora
  category: french
  max_steps: 150
  mix:
  - dataset: FrancophonIA/english_french
    format: prompt
    columns:
      prompt: english
      response: french
    # NOTE(review): nesting was reconstructed — confirm `prompt_prefix` is a
    # mix-item key and not a member of `columns`.
    prompt_prefix: "Translate the following English sentence to French:\n"
    dataset_split: "train[:3000]"
    max_samples: 3000
  - dataset: research/data/language-lesson-fr.jsonl
    format: chat
    weight: 6
  - dataset: tatsu-lab/alpaca
    format: alpaca
    dataset_split: "train[:400]"
    max_samples: 400
  args:
    lora_r: 32
    lora_alpha: 64
    neftune_noise_alpha: 5
    early_stopping_patience: 2
    val_split: 0.05
  description: >
    EN→FR translation (FrancophonIA/english_french) + TeacherVoice FR chat +
    alpaca replay. Evaluated on FrenchBench (french_bench_xnli, belebele_fra_Latn).
  eval_profile: french
  goals:
    task: french_bench_xnli
    min_improve: 0.01
    guard_tasks:
    - task: hellaswag
      max_regress: 0.03
  publish:
    hub_repo: MSGEncrypted/minicpm5-1b-french-lora
    mirror_repos:
    - build-small-hackathon/minicpm5-1b-french-lora
    private: false