# Skill matrix for the Modal finetune + lm-eval + publish pipeline.
#
# Each entry trains one QLoRA adapter for a skill/category, evaluates it
# against the matching slm-lm-eval profile (vs. a per-profile baseline),
# checks the result against `goals`, and — only if the gate passes —
# publishes the adapter to `publish.hub_repo` on the Hugging Face Hub.
#
# Baselines are always the unfine-tuned base model (`defaults.preset` →
# models.yaml model_id, e.g. openbmb/MiniCPM5-1B), stored as
# results/lm_eval/<preset>__baseline__<profile>/ and reused across jobs.
#
# Smoke limits (max_steps, max_samples, eval `limit` in the profile configs)
# keep hackathon runs affordable; bump them for full runs.
#
# publish.private is `false` so passing adapters land on the Hub publicly: the
# Well-Tuned badge requires a judge-visible, fine-tuned published model.
#
# Workflows (see modal/README.md):
#   modal run research/modal/finetune_app.py                    # full sweep: baselines -> train -> eval -> gate -> publish -> pull
#   modal run research/modal/finetune_app.py --job math-lora     # one skill
#   modal run research/modal/finetune_app.py --category math     # one category
#   modal run research/modal/finetune_app.py --eval-only --job math-lora
#   modal run research/modal/finetune_app.py --no-publish        # train+eval, skip Hub push
#   modal run research/modal/finetune_app.py::publish_only --job math-lora
#   modal run research/modal/finetune_app.py::pull --category math

# Shared settings for the `finetune` entries below.
defaults:
  preset: minicpm5-1b
  mode: qlora
  gpu: A10G          # QLoRA fits on T4 too; override per job with `gpu: T4` for cheaper runs
  # Entries that set their own `max_steps` use 120-200; coding-lora,
  # reasoning-lora and alpaca-lora set none and presumably inherit this value
  # — TODO confirm the fallback in finetune_app.py.
  max_steps: 100
  # Hugging Face namespace for published adapters.
  # NOTE(review): every entry also spells this org out in `publish.hub_repo`;
  # keep the two in sync if the namespace changes.
  hub_org: MSGEncrypted
  # Second eval pass for publish gates: balanced general SLM mix (limit 100).
  general_eval_profile: compare_study
  # Guards for that general pass: six tasks, each with the same 0.03
  # `max_regress` budget. Presumably the allowed drop vs. the baseline score —
  # TODO confirm units (absolute vs. relative) in the gate code.
  general_goals:
    guard_tasks:
      - task: arc_easy
        max_regress: 0.03
      - task: arc_challenge
        max_regress: 0.03
      - task: hellaswag
        max_regress: 0.03
      - task: piqa
        max_regress: 0.03
      - task: boolq
        max_regress: 0.03
      - task: gsm8k
        max_regress: 0.03

finetune:
  # --- teaching: lesson-planning agent chat data (Well-Tuned primary) ---
  # 8 local lesson chats overfit easily; mix in alpaca replay + NEFTune so
  # IFEval clears without washing out the lesson skill signal.
  - name: teaching-lora
    category: teaching
    max_steps: 150
    # Two-source mix: the local lesson chats carry an explicit weight, the
    # alpaca slice does not.
    mix:
      - dataset: research/data/education-lesson-chat.jsonl
        format: chat
        weight: 20                    # ~8 samples → ~160 examples
      - dataset: tatsu-lab/alpaca      # instruction-following replay for ifeval
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2   # keep best eval_loss checkpoint, not the last
      val_split: 0.05
    description: Lesson-planning chat + alpaca replay, r=32 + NEFTune
    eval_profile: instructions
    # Gate on ifeval only; this entry lists no per-job `guard_tasks`.
    # NOTE(review): presumably `defaults.general_goals` still guards the
    # general tasks on the second eval pass — confirm.
    goals:
      task: ifeval
      min_score: 0.15
      min_improve: 0.02
    # Public primary repo plus one mirror under build-small-hackathon.
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-teaching-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-teaching-lora
      private: false
  # --- science: MC-format science Q&A (sciq/ARC/OpenBookQA train) ---
  # Previous attempt used chat-format tutoring — wrong signal for MC benchmarks.
  # Model already scores 0.935 sciq; needs in-distribution MC Q→A to push higher.
  # allenai/sciq train: 11k factual science MC (question→correct_answer).
  # allenai/ai2_arc ARC-Easy train: elementary/science MC, boosts arc_* guards.
  # allenai/openbookqa train: fact-based science Q&A, improves openbookqa eval.
  # Local science-tutor-chat kept at low weight for style/explanation diversity.
  # MetaMathQA slice protects gsm8k guard (prevented 0.14 regression last run).
  # Reduced to r=16, no NEFTune: less catastrophic forgetting on small datasets.
  - name: science-lora
    category: science
    max_steps: 120
    # Six-source mix: three MC science sets, one local chat file, and two
    # replay slices (MetaMathQA, alpaca).
    mix:
      - dataset: allenai/sciq           # 11k MC science Q→A (in-distribution with sciq eval)
        format: prompt
        columns:
          prompt: question
          response: correct_answer
        dataset_split: "train[:1500]"
        max_samples: 1500
      # NOTE(review): for this slice and the OpenBookQA one, the response is
      # `answerKey` (presumably a choice label, not answer text) while only the
      # question column is mapped into the prompt — confirm the answer choices
      # are not needed in the prompt.
      - dataset: allenai/ai2_arc        # elementary science MC (ARC-Easy config only)
        format: prompt
        dataset_config: ARC-Easy
        columns:
          prompt: question
          response: answerKey
        dataset_split: "train[:500]"
        max_samples: 500
      - dataset: allenai/openbookqa     # fact-based open science Q&A
        format: prompt
        # Pin the config explicitly (the Hub repo ships `main` and
        # `additional`) instead of relying on the loader's default.
        dataset_config: main
        columns:
          prompt: question_stem
          response: answerKey
        dataset_split: "train[:400]"
        max_samples: 400
      - dataset: research/data/science-tutor-chat.jsonl   # style diversity
        format: chat
        weight: 4
      - dataset: meta-math/MetaMathQA   # gsm8k guard protection
        format: prompt
        columns:
          prompt: query
          response: response
        dataset_split: "train[:200]"
        max_samples: 200
      - dataset: tatsu-lab/alpaca       # general replay: protect hellaswag/piqa/boolq
        format: alpaca
        dataset_split: "train[:400]"
        max_samples: 400
    # r=16 and no `neftune_noise_alpha`, unlike the r=32 + NEFTune entries.
    args:
      lora_r: 16
      lora_alpha: 32
      early_stopping_patience: 3
      val_split: 0.05
    description: >
      sciq + ARC-Easy + OpenBookQA MC train + science-chat style + MetaMathQA
      guard + alpaca replay. r=16, no NEFTune to avoid gsm8k regression.
    eval_profile: science
    # NOTE(review): `min_score` sits far below the 0.935 sciq score quoted in
    # the comment above this entry, so `min_improve` is presumably the check
    # that actually decides the gate — confirm that is intended.
    goals:
      task: sciq
      min_score: 0.50
      min_improve: 0.02
      guard_tasks:
        - task: arc_challenge
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-science-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-science-lora
      private: false

  # --- math: GSM8K/MATH natural-language CoT augmentation (MetaMathQA) ---
  # MetaMathQA's NL chain-of-thought matches GSM8K's 5-shot format far better
  # than MathInstruct's program-of-thought answers (which regressed gsm8k).
  # The `mix:` adds a general-data replay slice (alpaca) so skill tuning does
  # not regress the arc/hellaswag/piqa guards. `args:` flows any
  # research/finetune.py hyperparameter straight through.
  - name: math-lora
    category: math
    max_steps: 150
    # MetaMathQA skill slice (first 3000 rows) plus a 600-row alpaca replay.
    mix:
      - dataset: meta-math/MetaMathQA
        format: prompt
        columns:
          prompt: query
          response: response
        dataset_split: "train[:3000]"
        max_samples: 3000
      - dataset: tatsu-lab/alpaca          # general replay: protect guard tasks
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    # NOTE(review): unlike every other mix-based entry in this file, no
    # `early_stopping_patience` / `val_split` is set here — confirm intentional.
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
    description: GSM8K/MATH NL-CoT (MetaMathQA) + alpaca replay, r=32 + NEFTune
    eval_profile: math
    # Gate on gsm8k, with three per-job guards at the same 0.03 budget used
    # in `defaults.general_goals`.
    goals:
      task: gsm8k
      min_score: 0.05
      min_improve: 0.02
      guard_tasks:
        - task: arc_challenge
          max_regress: 0.03
        - task: hellaswag
          max_regress: 0.03
        - task: piqa
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-math-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-math-lora
      private: false

  # --- coding: Python instruction-following code generation ---
  - name: coding-lora
    category: coding
    # Single-dataset form: `dataset`/`format` sit on the entry itself instead
    # of under a `mix:` list, so there is no replay slice. No `max_steps`
    # here — presumably falls back to `defaults.max_steps` (TODO confirm).
    dataset: iamtarun/python_code_instructions_18k_alpaca
    format: alpaca
    dataset_split: "train[:1000]"
    max_samples: 1000
    # No LoRA rank/alpha overrides; only checkpoint-selection settings.
    args:
      early_stopping_patience: 2   # keep best eval_loss checkpoint, not the last
      val_split: 0.05
    description: Python code instruction tuning (Hub, alpaca columns)
    eval_profile: code
    # Gate on mbpp with a smaller `min_improve` (0.01) than the 0.02 used by
    # the teaching/science/math entries.
    goals:
      task: mbpp
      min_score: 0.05
      min_improve: 0.01
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
        - task: piqa
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-coding-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-coding-lora
      private: false

  # --- reasoning: multi-turn chat with reasoning-heavy conversations ---
  - name: reasoning-lora
    category: reasoning
    # Single-dataset form (no `mix:`, no replay slice): first 500 rows of the
    # `all` config. No `max_steps` — presumably `defaults.max_steps` applies
    # (TODO confirm).
    dataset: HuggingFaceTB/smoltalk
    format: chat
    dataset_config: all
    dataset_split: "train[:500]"
    max_samples: 500
    args:
      early_stopping_patience: 2   # keep best eval_loss checkpoint, not the last
      val_split: 0.05
    description: Multi-turn reasoning/chat subset (Hub)
    eval_profile: reasoning
    # NOTE(review): the gate task is gsm8k (same task math-lora gates on) —
    # confirm the `reasoning` eval profile actually reports gsm8k and
    # hellaswag, otherwise the gate has nothing to compare.
    goals:
      task: gsm8k
      min_score: 0.05
      min_improve: 0.01
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-reasoning-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-reasoning-lora
      private: false

  # --- medical: clinical Q&A (MedQA/Meadow) + alpaca replay ---
  # New vertical. Same overfit-guard recipe as teaching-lora (science-lora
  # instead uses r=16 without NEFTune): a focused skill dataset, alpaca replay
  # + NEFTune + r=32 so PubMedQA/MedMCQA improve without regressing the
  # arc_challenge general-knowledge guard.
  - name: medical-lora
    category: medical
    max_steps: 200
    # 2000-row medical slice plus a 600-row alpaca replay; neither source sets
    # an explicit `weight`.
    mix:
      - dataset: medalpaca/medical_meadow_medqa   # USMLE-style QA, alpaca columns
        format: alpaca
        dataset_split: "train[:2000]"
        max_samples: 2000
      - dataset: tatsu-lab/alpaca                  # general replay: protect guards
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2
      val_split: 0.05
    description: Medical QA (medalpaca Meadow MedQA) + alpaca replay, r=32 + NEFTune
    eval_profile: medical
    # Gate on pubmedqa only; MedMCQA is not listed as a goal or guard here.
    goals:
      task: pubmedqa
      min_score: 0.45
      min_improve: 0.02
      guard_tasks:
        - task: arc_challenge
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-medical-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-medical-lora
      private: false

  # --- tool-use: function/tool-calling (xLAM) ---
  # New vertical that closes the loop with the existing BFCL agentic benchmark.
  # The publish gate guards general ability (lm-eval has no function-call task);
  # the *skill* metric is the BFCL/tau-bench suite run via slm-benchmark:
  #   uv run --package slm-evals slm-benchmark --model <adapter> --benchmarks bfcl --max-samples 50
  - name: tool-use-lora
    category: tool_use
    max_steps: 200
    mix:
      # NOTE(review): only `query` is mapped into the prompt. If the dataset
      # keeps the available tool definitions in a separate column, the model
      # presumably never sees them during training — confirm that is intended.
      - dataset: Salesforce/xlam-function-calling-60k
        format: prompt
        columns:
          prompt: query
          response: answers           # JSON function-call(s) the model must emit
        dataset_split: "train[:3000]"
        max_samples: 3000
      - dataset: tatsu-lab/alpaca       # general replay: protect guards
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2
      val_split: 0.05
    description: >
      Function/tool-calling (Salesforce xLAM) + alpaca replay. Skill metric is the
      BFCL agentic suite (slm-benchmark); lm-eval gate only guards general ability.
    eval_profile: compare_study
    # Same profile as `defaults.general_eval_profile`. The goal has no
    # `min_score` and a `min_improve` of 0.0, so presumably arc_easy only has
    # to not drop below the baseline — TODO confirm gate semantics.
    goals:
      task: arc_easy
      min_improve: 0.0
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
        - task: piqa
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-tool-use-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-tool-use-lora
      private: false

  # --- commonsense: everyday-reasoning MCQ (CommonsenseQA train) ---
  # New vertical. In-distribution MC train (question -> answerKey), same recipe
  # as science-lora's ARC/OpenBookQA slices, with alpaca replay and
  # piqa/hellaswag guards.
  - name: commonsense-lora
    category: commonsense
    max_steps: 150
    mix:
      # NOTE(review): the response is `answerKey` (presumably a choice label)
      # while only `question` is mapped into the prompt — confirm the answer
      # choices are not needed in the prompt for the label to be learnable.
      - dataset: tau/commonsense_qa     # 5-way everyday-knowledge MCQ, in-distribution
        format: prompt
        columns:
          prompt: question
          response: answerKey
        dataset_split: "train[:2000]"
        max_samples: 2000
      - dataset: tatsu-lab/alpaca       # general replay: protect piqa/hellaswag guards
        format: alpaca
        dataset_split: "train[:600]"
        max_samples: 600
    # r=16 and no `neftune_noise_alpha`, matching science-lora's settings.
    args:
      lora_r: 16
      lora_alpha: 32
      early_stopping_patience: 2
      val_split: 0.05
    description: CommonsenseQA MC train + alpaca replay, r=16
    eval_profile: commonsense
    goals:
      task: commonsense_qa
      min_score: 0.30
      min_improve: 0.02
      guard_tasks:
        - task: piqa
          max_regress: 0.03
        - task: hellaswag
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-commonsense-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-commonsense-lora
      private: false

  # --- general instructions baseline: no goals/publish -> local-only adapter ---
  - name: alpaca-lora
    category: instructions
    # No `goals`, `publish`, `args` or `max_steps` keys on this entry.
    dataset: tatsu-lab/alpaca
    format: alpaca
    # NOTE(review): every other Hub dataset in this file pairs `max_samples: N`
    # with a sliced split ("train[:N]"); this one requests the whole `train`
    # split and caps at 200 — confirm whether "train[:200]" was intended.
    dataset_split: train
    max_samples: 200
    description: General instruction tuning baseline (Hub, local-only)
    eval_profile: instructions

  # --- language lessons: FR/AR TeacherVoice coach (Cohere-free stack) ---
  - name: language-lesson-lora
    category: language
    max_steps: 200
    # Four-source mix: FR and AR lesson chats at equal weight (12 each), the
    # science tutor chat file also used by science-lora (weight 4), and an
    # alpaca slice. This is the only entry that spells out `weight: 1`.
    mix:
      - dataset: research/data/language-lesson-fr.jsonl
        format: chat
        weight: 12
      - dataset: research/data/language-lesson-ar.jsonl
        format: chat
        weight: 12
      - dataset: research/data/science-tutor-chat.jsonl
        format: chat
        weight: 4
      - dataset: tatsu-lab/alpaca
        format: alpaca
        dataset_split: "train[:400]"
        max_samples: 400
        weight: 1
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2
      val_split: 0.05
    description: >
      FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
      build_language_lesson_chat.py) + English replay
    eval_profile: multilingual
    # No `min_score`; `min_improve` is 0.0, so presumably xnli only has to not
    # fall below the baseline — TODO confirm gate semantics.
    goals:
      task: xnli
      min_improve: 0.0
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-language-lesson-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-language-lesson-lora
      private: false

  # --- french: EN→FR translation (FrancophonIA/english_french) + FrenchBench gate ---
  # 320k parallel sentences from Kaggle englishfrench-fornmt (Hub: FrancophonIA/english_french).
  # FrenchBench (CroissantLLM) is the official French lm-eval suite; gate on french_bench_xnli.
  - name: french-lora
    category: french
    max_steps: 150
    # Three-source mix: EN→FR sentence pairs, the FR lesson chat file also used
    # by language-lesson-lora (weight 6 here), and an alpaca replay slice.
    mix:
      - dataset: FrancophonIA/english_french
        format: prompt
        columns:
          prompt: english
          response: french
        # Double-quoted on purpose: YAML turns the trailing `\n` into a real
        # newline. Presumably prepended to the `english` column text —
        # TODO confirm how the pipeline applies `prompt_prefix`.
        prompt_prefix: "Translate the following English sentence to French:\n"
        dataset_split: "train[:3000]"
        max_samples: 3000
      - dataset: research/data/language-lesson-fr.jsonl
        format: chat
        weight: 6
      - dataset: tatsu-lab/alpaca
        format: alpaca
        dataset_split: "train[:400]"
        max_samples: 400
    args:
      lora_r: 32
      lora_alpha: 64
      neftune_noise_alpha: 5
      early_stopping_patience: 2
      val_split: 0.05
    description: >
      EN→FR translation (FrancophonIA/english_french) + TeacherVoice FR chat +
      alpaca replay. Evaluated on FrenchBench (french_bench_xnli, belebele_fra_Latn).
    eval_profile: french
    # Gate on french_bench_xnli only; belebele_fra_Latn is named in the
    # description but is not a goal or guard task here.
    goals:
      task: french_bench_xnli
      min_improve: 0.01
      guard_tasks:
        - task: hellaswag
          max_regress: 0.03
    publish:
      hub_repo: MSGEncrypted/minicpm5-1b-french-lora
      mirror_repos:
        - build-small-hackathon/minicpm5-1b-french-lora
      private: false