---
# Skill matrix for the Modal finetune + lm-eval + publish pipeline.
#
# Each entry trains one QLoRA adapter for a skill/category, evaluates it
# against the matching slm-lm-eval profile (vs. a per-profile baseline),
# checks the result against `goals`, and — only if the gate passes —
# publishes the adapter to `publish.hub_repo` on the Hugging Face Hub.
#
# Baselines are always the unfine-tuned base model (`defaults.preset` →
# models.yaml model_id, e.g. openbmb/MiniCPM5-1B), stored as
# results/lm_eval/__baseline__/ and reused across jobs.
#
# Smoke limits (max_steps, max_samples, eval `limit` in the profile configs)
# keep hackathon runs affordable; bump them for full runs.
#
# publish.private is `false` so passing adapters land on the Hub publicly: the
# Well-Tuned badge requires a judge-visible, fine-tuned published model.
#
# Workflows (see modal/README.md):
#   modal run research/modal/finetune_app.py  # full sweep: baselines -> train -> eval -> gate -> publish -> pull
#   modal run research/modal/finetune_app.py --job math-lora  # one skill
#   modal run research/modal/finetune_app.py --category math  # one category
#   modal run research/modal/finetune_app.py --eval-only --job math-lora
#   modal run research/modal/finetune_app.py --no-publish  # train+eval, skip Hub push
#   modal run research/modal/finetune_app.py::publish_only --job math-lora
#   modal run research/modal/finetune_app.py::pull --category math

# Shared settings applied to every job unless the job overrides them.
# NOTE(review): the nesting of hub_org / general_eval_profile / general_goals
# under `defaults` was reconstructed from key order — confirm against the
# loader in research/modal/finetune_app.py.
defaults:
  preset: minicpm5-1b
  mode: qlora
  gpu: A10G  # QLoRA fits on T4 too; override per job with `gpu: T4` for cheaper runs
  max_steps: 100
  # Hugging Face namespace for published adapters.
  hub_org: MSGEncrypted
  # Second eval pass for publish gates: balanced general SLM mix (limit 100).
  general_eval_profile: compare_study
  general_goals:
    guard_tasks:
      - task: arc_easy
        max_regress: 0.03
      - task: arc_challenge
        max_regress: 0.03
      - task: hellaswag
        max_regress: 0.03
      - task: piqa
        max_regress: 0.03
      - task: boolq
        max_regress: 0.03
      - task: gsm8k
        max_regress: 0.03

# One entry per adapter. Common keys:
#   mix / dataset  training data (a weighted mix, or a single dataset)
#   args           hyperparameters passed through to research/finetune.py
#   eval_profile   lm-eval profile used for the skill gate
#   goals          gate thresholds (primary task + optional guard_tasks)
#   publish        Hub destination(s); omit for a local-only adapter
finetune:
  # --- teaching: lesson-planning agent chat data (Well-Tuned primary) ---
  # 8 local lesson chats overfit easily; mix in alpaca replay + NEFTune so
  # IFEval clears without washing out the lesson skill signal.
  - name: teaching-lora
    category: teaching
    max_steps: 150
    mix:
      - dataset: research/data/education-lesson-chat.jsonl
        format: chat
        weight: 20  # ~8 samples → ~160 examples
      - dataset: tatsu-lab/alpaca  # instruction-following replay for ifeval
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
      val_split: 0.05
    description: Lesson-planning chat + alpaca replay, r=32 + NEFTune
    eval_profile: instructions
    goals:
      task: ifeval
      min_score: 0.15
      min_improve: 0.02
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-teaching-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-teaching-lora
      private: false

  # --- science: MC-format science Q&A (sciq/ARC/OpenBookQA train) ---
  # Previous attempt used chat-format tutoring — wrong signal for MC benchmarks.
  # Model already scores 0.935 sciq; needs in-distribution MC Q→A to push higher.
  # allenai/sciq train: 11k factual science MC (question→correct_answer).
  # allenai/ai2_arc ARC-Easy train: elementary/science MC, boosts arc_* guards.
  # allenai/openbookqa train: fact-based science Q&A, improves openbookqa eval.
  # Local science-tutor-chat kept at low weight for style/explanation diversity.
  # MetaMathQA slice protects gsm8k guard (prevented 0.14 regression last run).
  # Reduced to r=16, no NEFTune: less catastrophic forgetting on small datasets.
  - name: science-lora
    category: science
    max_steps: 120
    mix:
      - dataset: allenai/sciq  # 11k MC science Q→A (in-distribution with sciq eval)
        format: prompt
        columns:
          prompt: question
          response: correct_answer
        dataset_split: "train[:1500]"
        max_samples: 1500
      # NOTE(review): for the two items below, the Hub `question` /
      # `question_stem` columns hold only the stem and `answerKey` is a bare
      # label. Unless research/finetune.py also renders `choices` into the
      # prompt, the target label has no options to refer to — confirm.
      - dataset: allenai/ai2_arc  # ARC-Easy config only (elementary science MC)
        format: prompt
        dataset_config: ARC-Easy
        columns:
          prompt: question
          response: answerKey
        dataset_split: "train[:500]"
        max_samples: 500
      - dataset: allenai/openbookqa  # fact-based open science Q&A
        format: prompt
        columns:
          prompt: question_stem
          response: answerKey
        dataset_split: "train[:400]"
        max_samples: 400
      - dataset: research/data/science-tutor-chat.jsonl  # style diversity
        format: chat
        weight: 4
      - dataset: meta-math/MetaMathQA  # gsm8k guard protection
        format: prompt
        columns:
          prompt: query
          response: response
        dataset_split: "train[:200]"
        max_samples: 200
      - dataset: tatsu-lab/alpaca  # general replay: protect hellaswag/piqa/boolq
        format: alpaca
        dataset_split: "train[:400]"
        max_samples: 400
    args:
      lora_r: 16
      lora_alpha: 32
      early_stopping_patience: 3
      val_split: 0.05
    description: >
      sciq + ARC-Easy + OpenBookQA MC train + science-chat style + MetaMathQA
      guard + alpaca replay. r=16, no NEFTune to avoid gsm8k regression.
    eval_profile: science
    goals:
      task: sciq
      # NOTE(review): min_score is well below the 0.935 baseline quoted above,
      # so min_improve looks like the effective gate here — confirm intended.
      min_score: 0.50
      min_improve: 0.02
      guard_tasks:
        - task: arc_challenge
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-science-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-science-lora
      private: false

  # --- math: GSM8K/MATH natural-language CoT augmentation (MetaMathQA) ---
  # MetaMathQA's NL chain-of-thought matches GSM8K's 5-shot format far better
  # than MathInstruct's program-of-thought answers (which regressed gsm8k).
  # The `mix:` adds a general-data replay slice (alpaca) so skill tuning does
  # not regress the arc/hellaswag/piqa guards. `args:` flows any
  # research/finetune.py hyperparameter straight through.
  - name: math-lora
    category: math
    max_steps: 150
    mix:
      - dataset: meta-math/MetaMathQA
        format: prompt
        columns:
          prompt: query
          response: response
        dataset_split: "train[:3000]"
        max_samples: 3000
      - dataset: tatsu-lab/alpaca  # general replay: protect guard tasks
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
    description: GSM8K/MATH NL-CoT (MetaMathQA) + alpaca replay, r=32 + NEFTune
    eval_profile: math
    goals:
      task: gsm8k
      min_score: 0.05
      min_improve: 0.02
      guard_tasks:
        - task: arc_challenge
          max_regress: 0.03
        - task: hellaswag
          max_regress: 0.03
        - task: piqa
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-math-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-math-lora
      private: false

  # --- coding: Python instruction-following code generation ---
  - name: coding-lora
    category: coding
    dataset: iamtarun/python_code_instructions_18k_alpaca
    format: alpaca
    dataset_split: "train[:1000]"
    max_samples: 1000
    args:
      early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
      val_split: 0.05
    description: Python code instruction tuning (Hub, alpaca columns)
    eval_profile: code
    goals:
      task: mbpp
      min_score: 0.05
      min_improve: 0.01
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
        - task: piqa
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-coding-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-coding-lora
      private: false

  # --- reasoning: multi-turn chat with reasoning-heavy conversations ---
  - name: reasoning-lora
    category: reasoning
    dataset: HuggingFaceTB/smoltalk
    format: chat
    dataset_config: all
    dataset_split: "train[:500]"
    max_samples: 500
    args:
      early_stopping_patience: 2  # keep best eval_loss checkpoint, not the last
      val_split: 0.05
    description: Multi-turn reasoning/chat subset (Hub)
    eval_profile: reasoning
    goals:
      task: gsm8k
      min_score: 0.05
      min_improve: 0.01
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-reasoning-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-reasoning-lora
      private: false

  # --- medical: clinical Q&A (MedQA/Meadow) + alpaca replay ---
  # New vertical. Same overfit-guard recipe as teaching/science: a focused
  # skill dataset up-weighted, alpaca replay + NEFTune + r=32 so PubMedQA/MedMCQA
  # improve without regressing the arc_challenge general-knowledge guard.
  - name: medical-lora
    category: medical
    max_steps: 200
    mix:
      - dataset: medalpaca/medical_meadow_medqa  # USMLE-style QA, alpaca columns
        format: alpaca
        dataset_split: "train[:2000]"
        max_samples: 2000
      - dataset: tatsu-lab/alpaca  # general replay: protect guards
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2
      val_split: 0.05
    description: Medical QA (medalpaca Meadow MedQA) + alpaca replay, r=32 + NEFTune
    eval_profile: medical
    goals:
      task: pubmedqa
      min_score: 0.45
      min_improve: 0.02
      guard_tasks:
        - task: arc_challenge
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-medical-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-medical-lora
      private: false

  # --- tool-use: function/tool-calling (xLAM) ---
  # New vertical that closes the loop with the existing BFCL agentic benchmark.
  # The publish gate guards general ability (lm-eval has no function-call task);
  # the *skill* metric is the BFCL/tau-bench suite run via slm-benchmark:
  #   uv run --package slm-evals slm-benchmark --model <model-or-adapter> --benchmarks bfcl --max-samples 50
  - name: tool-use-lora
    category: tool_use
    max_steps: 200
    mix:
      - dataset: Salesforce/xlam-function-calling-60k
        format: prompt
        columns:
          prompt: query
          response: answers  # JSON function-call(s) the model must emit
        dataset_split: "train[:3000]"
        max_samples: 3000
      - dataset: tatsu-lab/alpaca  # general replay: protect guards
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2
      val_split: 0.05
    description: >
      Function/tool-calling (Salesforce xLAM) + alpaca replay. Skill metric is
      the BFCL agentic suite (slm-benchmark); lm-eval gate only guards general
      ability.
    eval_profile: compare_study
    goals:
      task: arc_easy
      min_improve: 0.0
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
        - task: piqa
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-tool-use-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-tool-use-lora
      private: false

  # --- commonsense: everyday-reasoning MCQ (CommonsenseQA train) ---
  # New vertical. In-distribution MC train (question -> answerKey), same recipe
  # as science-lora's ARC/OpenBookQA slices, with alpaca + winogrande-style guards.
  - name: commonsense-lora
    category: commonsense
    max_steps: 150
    mix:
      # NOTE(review): the Hub `question` column holds only the stem and
      # `answerKey` is a bare label; unless research/finetune.py also renders
      # `choices` into the prompt, the label has no options to refer to — confirm.
      - dataset: tau/commonsense_qa  # 5-way everyday-knowledge MCQ, in-distribution
        format: prompt
        columns:
          prompt: question
          response: answerKey
        dataset_split: "train[:2000]"
        max_samples: 2000
      - dataset: tatsu-lab/alpaca  # general replay: protect piqa/hellaswag guards
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    args:
      lora_r: 16
      lora_alpha: 32
      early_stopping_patience: 2
      val_split: 0.05
    description: CommonsenseQA MC train + alpaca replay, r=16
    eval_profile: commonsense
    goals:
      task: commonsense_qa
      min_score: 0.30
      min_improve: 0.02
      guard_tasks:
        - task: piqa
          max_regress: 0.03
        - task: hellaswag
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-commonsense-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-commonsense-lora
      private: false

  # --- general instructions baseline: no goals/publish -> local-only adapter ---
  - name: alpaca-lora
    category: instructions
    dataset: tatsu-lab/alpaca
    format: alpaca
    dataset_split: train
    max_samples: 200
    description: General instruction tuning baseline (Hub, local-only)
    eval_profile: instructions

  # --- language lessons: FR/AR TeacherVoice coach (Cohere-free stack) ---
  - name: language-lesson-lora
    category: language
    max_steps: 200
    mix:
      - dataset: research/data/language-lesson-fr.jsonl
        format: chat
        weight: 12
      - dataset: research/data/language-lesson-ar.jsonl
        format: chat
        weight: 12
      - dataset: research/data/science-tutor-chat.jsonl
        format: chat
        weight: 4
      - dataset: tatsu-lab/alpaca
        format: alpaca
        dataset_split: "train[:400]"
        max_samples: 400
        weight: 1
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2
      val_split: 0.05
    description: >
      FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
      build_language_lesson_chat.py) + English replay
    eval_profile: multilingual
    goals:
      task: xnli
      min_improve: 0.0
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-language-lesson-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-language-lesson-lora
      private: false

  # --- french: EN→FR translation (FrancophonIA/english_french) + FrenchBench gate ---
  # 320k parallel sentences from Kaggle englishfrench-fornmt (Hub: FrancophonIA/english_french).
  # FrenchBench (CroissantLLM) is the official French lm-eval suite; gate on french_bench_xnli.
  - name: french-lora
    category: french
    max_steps: 150
    mix:
      - dataset: FrancophonIA/english_french
        format: prompt
        columns:
          prompt: english
          response: french
        # Double-quoted so the trailing \n is a real newline.
        # NOTE(review): placed as a sibling of `columns` — confirm the loader
        # does not expect it nested inside `columns`.
        prompt_prefix: "Translate the following English sentence to French:\n"
        dataset_split: "train[:3000]"
        max_samples: 3000
      - dataset: research/data/language-lesson-fr.jsonl
        format: chat
        weight: 6
      - dataset: tatsu-lab/alpaca
        format: alpaca
        dataset_split: "train[:400]"
        max_samples: 400
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2
      val_split: 0.05
    description: >
      EN→FR translation (FrancophonIA/english_french) + TeacherVoice FR chat +
      alpaca replay. Evaluated on FrenchBench (french_bench_xnli,
      belebele_fra_Latn).
    eval_profile: french
    goals:
      task: french_bench_xnli
      min_improve: 0.01
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-french-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-french-lora
      private: false