Spaces:
Sleeping
Sleeping
MSG msgencrypted-auto committed on
Commit ·
bbff1ca
1
Parent(s): 28543d3
Feat/last hour (#24)
Browse files
* eval experiment and profiles
* experiments
* french language
* french language
* evals
* evals
---------
Co-authored-by: msgencrypted-auto <msgencrypted.auto@gmail.com>
- research/data/build_language_lesson_chat.py +42 -0
- research/evals/configs/eval_profiles.yaml +53 -0
- research/evals/configs/lm_eval_commonsense.yaml +20 -0
- research/evals/configs/lm_eval_french.yaml +21 -0
- research/evals/configs/lm_eval_medical.yaml +20 -0
- research/evals/configs/lm_eval_multilingual.yaml +20 -0
- research/evals/configs/lm_eval_safety.yaml +22 -0
- research/evals/docs/eval_profiles.md +4 -0
- research/finetune.py +11 -5
- research/modal/README.md +2 -0
- research/modal/experiments.yaml +217 -17
research/data/build_language_lesson_chat.py
CHANGED
|
@@ -43,9 +43,11 @@ MAX_ASSISTANT_CHARS = 600
|
|
| 43 |
EVAL_HOLDOUT_RATIO = 0.05
|
| 44 |
|
| 45 |
DEFAULT_FR_SOURCES = (
|
|
|
|
| 46 |
"angeluriot/french_instruct",
|
| 47 |
"CohereLabs/aya_dataset",
|
| 48 |
"pinzhenchen/alpaca-cleaned-fr",
|
|
|
|
| 49 |
)
|
| 50 |
DEFAULT_AR_SOURCES = (
|
| 51 |
"arbml/CIDAR",
|
|
@@ -54,9 +56,11 @@ DEFAULT_AR_SOURCES = (
|
|
| 54 |
)
|
| 55 |
|
| 56 |
SOURCE_CAPS: dict[str, dict[str, int]] = {
|
|
|
|
| 57 |
"angeluriot/french_instruct": {"fr": 8000},
|
| 58 |
"CohereLabs/aya_dataset": {"fr": 3000, "ar": 3000},
|
| 59 |
"pinzhenchen/alpaca-cleaned-fr": {"fr": 2000},
|
|
|
|
| 60 |
"arbml/CIDAR": {"ar": 8000},
|
| 61 |
"ClusterlabAi/InstAr-500k": {"ar": 5000},
|
| 62 |
}
|
|
@@ -138,6 +142,23 @@ def _load_seeds(path: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]
|
|
| 138 |
return fr_rows, ar_rows
|
| 139 |
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
def _iter_french_instruct(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
|
| 142 |
from datasets import load_dataset
|
| 143 |
|
|
@@ -197,6 +218,25 @@ def _iter_alpaca_fr(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
|
|
| 197 |
break
|
| 198 |
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
def _iter_cidar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
|
| 201 |
from datasets import load_dataset
|
| 202 |
|
|
@@ -235,12 +275,14 @@ def _iter_instar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
|
|
| 235 |
|
| 236 |
|
| 237 |
_SOURCE_LOADERS: dict[str, dict[str, Any]] = {
|
|
|
|
| 238 |
"angeluriot/french_instruct": {"fr": _iter_french_instruct},
|
| 239 |
"CohereLabs/aya_dataset": {
|
| 240 |
"fr": lambda n: _iter_aya("fra", n),
|
| 241 |
"ar": lambda n: _iter_aya("arb", n),
|
| 242 |
},
|
| 243 |
"pinzhenchen/alpaca-cleaned-fr": {"fr": _iter_alpaca_fr},
|
|
|
|
| 244 |
"arbml/CIDAR": {"ar": _iter_cidar},
|
| 245 |
"ClusterlabAi/InstAr-500k": {"ar": _iter_instar},
|
| 246 |
}
|
|
|
|
| 43 |
EVAL_HOLDOUT_RATIO = 0.05
|
| 44 |
|
| 45 |
DEFAULT_FR_SOURCES = (
|
| 46 |
+
"FrancophonIA/english_french",
|
| 47 |
"angeluriot/french_instruct",
|
| 48 |
"CohereLabs/aya_dataset",
|
| 49 |
"pinzhenchen/alpaca-cleaned-fr",
|
| 50 |
+
"jpacifico/French-Alpaca-dataset-Instruct-110K",
|
| 51 |
)
|
| 52 |
DEFAULT_AR_SOURCES = (
|
| 53 |
"arbml/CIDAR",
|
|
|
|
| 56 |
)
|
| 57 |
|
| 58 |
SOURCE_CAPS: dict[str, dict[str, int]] = {
|
| 59 |
+
"FrancophonIA/english_french": {"fr": 4000},
|
| 60 |
"angeluriot/french_instruct": {"fr": 8000},
|
| 61 |
"CohereLabs/aya_dataset": {"fr": 3000, "ar": 3000},
|
| 62 |
"pinzhenchen/alpaca-cleaned-fr": {"fr": 2000},
|
| 63 |
+
"jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": 4000},
|
| 64 |
"arbml/CIDAR": {"ar": 8000},
|
| 65 |
"ClusterlabAi/InstAr-500k": {"ar": 5000},
|
| 66 |
}
|
|
|
|
| 142 |
return fr_rows, ar_rows
|
| 143 |
|
| 144 |
|
| 145 |
+
def _iter_english_french(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
|
| 146 |
+
"""EN→FR parallel sentences — user asks in English, coach replies in French."""
|
| 147 |
+
from datasets import load_dataset
|
| 148 |
+
|
| 149 |
+
ds = load_dataset("FrancophonIA/english_french", split="train", streaming=True)
|
| 150 |
+
count = 0
|
| 151 |
+
for row in ds:
|
| 152 |
+
english = (row.get("english") or "").strip()
|
| 153 |
+
french = (row.get("french") or "").strip()
|
| 154 |
+
if english and _assistant_ok(french):
|
| 155 |
+
user = f"Translate the following to French:\n{english}"
|
| 156 |
+
yield user, french, None
|
| 157 |
+
count += 1
|
| 158 |
+
if count >= max_rows:
|
| 159 |
+
break
|
| 160 |
+
|
| 161 |
+
|
| 162 |
def _iter_french_instruct(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
|
| 163 |
from datasets import load_dataset
|
| 164 |
|
|
|
|
| 218 |
break
|
| 219 |
|
| 220 |
|
| 221 |
+
def _iter_french_alpaca_110k(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
|
| 222 |
+
from datasets import load_dataset
|
| 223 |
+
|
| 224 |
+
ds = load_dataset(
|
| 225 |
+
"jpacifico/French-Alpaca-dataset-Instruct-110K", split="train", streaming=True
|
| 226 |
+
)
|
| 227 |
+
count = 0
|
| 228 |
+
for row in ds:
|
| 229 |
+
instruction = (row.get("instruction") or "").strip()
|
| 230 |
+
inp = (row.get("input") or "").strip()
|
| 231 |
+
output = (row.get("output") or "").strip()
|
| 232 |
+
user_text = f"{instruction}\n{inp}".strip() if inp else instruction
|
| 233 |
+
if user_text and _assistant_ok(output):
|
| 234 |
+
yield user_text, output, None
|
| 235 |
+
count += 1
|
| 236 |
+
if count >= max_rows:
|
| 237 |
+
break
|
| 238 |
+
|
| 239 |
+
|
| 240 |
def _iter_cidar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
|
| 241 |
from datasets import load_dataset
|
| 242 |
|
|
|
|
| 275 |
|
| 276 |
|
| 277 |
_SOURCE_LOADERS: dict[str, dict[str, Any]] = {
|
| 278 |
+
"FrancophonIA/english_french": {"fr": _iter_english_french},
|
| 279 |
"angeluriot/french_instruct": {"fr": _iter_french_instruct},
|
| 280 |
"CohereLabs/aya_dataset": {
|
| 281 |
"fr": lambda n: _iter_aya("fra", n),
|
| 282 |
"ar": lambda n: _iter_aya("arb", n),
|
| 283 |
},
|
| 284 |
"pinzhenchen/alpaca-cleaned-fr": {"fr": _iter_alpaca_fr},
|
| 285 |
+
"jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": _iter_french_alpaca_110k},
|
| 286 |
"arbml/CIDAR": {"ar": _iter_cidar},
|
| 287 |
"ClusterlabAi/InstAr-500k": {"ar": _iter_instar},
|
| 288 |
}
|
research/evals/configs/eval_profiles.yaml
CHANGED
|
@@ -72,6 +72,59 @@ profiles:
|
|
| 72 |
tasks:
|
| 73 |
- ifeval
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
general_slm:
|
| 76 |
tool: slm-lm-eval
|
| 77 |
claim: General ~1B SLM baseline
|
|
|
|
| 72 |
tasks:
|
| 73 |
- ifeval
|
| 74 |
|
| 75 |
+
medical:
|
| 76 |
+
tool: slm-lm-eval
|
| 77 |
+
claim: Better medical knowledge
|
| 78 |
+
description: Clinical Q&A — PubMedQA + MedMCQA + MedQA (USMLE) with arc guard.
|
| 79 |
+
config: lm_eval_medical.yaml
|
| 80 |
+
tasks:
|
| 81 |
+
- pubmedqa
|
| 82 |
+
- medmcqa
|
| 83 |
+
- medqa_4options
|
| 84 |
+
- arc_challenge
|
| 85 |
+
|
| 86 |
+
multilingual:
|
| 87 |
+
tool: slm-lm-eval
|
| 88 |
+
claim: Better multilingual understanding
|
| 89 |
+
description: Cross-lingual NLI / commonsense / coreference (XNLI, XCOPA, XWinograd).
|
| 90 |
+
config: lm_eval_multilingual.yaml
|
| 91 |
+
tasks:
|
| 92 |
+
- xnli
|
| 93 |
+
- xcopa
|
| 94 |
+
- xwinograd
|
| 95 |
+
|
| 96 |
+
commonsense:
|
| 97 |
+
tool: slm-lm-eval
|
| 98 |
+
claim: Better commonsense reasoning
|
| 99 |
+
description: Everyday-knowledge MCQ + coreference + physical commonsense.
|
| 100 |
+
config: lm_eval_commonsense.yaml
|
| 101 |
+
tasks:
|
| 102 |
+
- commonsense_qa
|
| 103 |
+
- winogrande
|
| 104 |
+
- piqa
|
| 105 |
+
- hellaswag
|
| 106 |
+
|
| 107 |
+
safety:
|
| 108 |
+
tool: slm-lm-eval
|
| 109 |
+
claim: More truthful, fewer imitative falsehoods
|
| 110 |
+
description: TruthfulQA MC2/MC1 (eval-only; do not train on the test set).
|
| 111 |
+
config: lm_eval_safety.yaml
|
| 112 |
+
tasks:
|
| 113 |
+
- truthfulqa_mc2
|
| 114 |
+
- truthfulqa_mc1
|
| 115 |
+
- arc_easy
|
| 116 |
+
|
| 117 |
+
french:
|
| 118 |
+
tool: slm-lm-eval
|
| 119 |
+
claim: Better French understanding and translation
|
| 120 |
+
description: Official FrenchBench MC tasks + WMT14 EN→FR (CroissantLLM benchmark suite).
|
| 121 |
+
config: lm_eval_french.yaml
|
| 122 |
+
tasks:
|
| 123 |
+
- french_bench_xnli
|
| 124 |
+
- belebele_fra_Latn
|
| 125 |
+
- french_bench_boolqa
|
| 126 |
+
- wmt14-en-fr
|
| 127 |
+
|
| 128 |
general_slm:
|
| 129 |
tool: slm-lm-eval
|
| 130 |
claim: General ~1B SLM baseline
|
research/evals/configs/lm_eval_commonsense.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Commonsense profile — everyday reasoning, coreference, physical commonsense
|
| 2 |
+
# Run: slm-lm-eval --profile commonsense --preset minicpm5-1b --experiment-name commonsense-baseline
|
| 3 |
+
|
| 4 |
+
profile: commonsense
|
| 5 |
+
claim: Better commonsense reasoning
|
| 6 |
+
|
| 7 |
+
tasks:
|
| 8 |
+
- commonsense_qa # 5-way everyday-knowledge MCQ (gate task)
|
| 9 |
+
- winogrande # pronoun-resolution commonsense
|
| 10 |
+
- piqa # physical commonsense (general-capability guard)
|
| 11 |
+
- hellaswag # grounded commonsense guard
|
| 12 |
+
|
| 13 |
+
num_fewshot: 0
|
| 14 |
+
limit: 200
|
| 15 |
+
seed: 42
|
| 16 |
+
batch_size: auto
|
| 17 |
+
device: auto
|
| 18 |
+
dtype: bfloat16
|
| 19 |
+
trust_remote_code: true
|
| 20 |
+
output_dir: results/lm_eval
|
research/evals/configs/lm_eval_french.yaml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# French profile — official FrenchBench (CroissantLLM) + EN→FR translation
|
| 2 |
+
# Pairs with french-lora (FrancophonIA/english_french). Run:
|
| 3 |
+
# slm-lm-eval --profile french --preset minicpm5-1b --experiment-name french-baseline
|
| 4 |
+
|
| 5 |
+
profile: french
|
| 6 |
+
claim: Better French understanding and translation
|
| 7 |
+
|
| 8 |
+
tasks:
|
| 9 |
+
- french_bench_xnli # French NLI (multiple choice; FrenchBench official)
|
| 10 |
+
- belebele_fra_Latn # French reading comprehension (FLORES-200 based)
|
| 11 |
+
- french_bench_boolqa # French boolean QA
|
| 12 |
+
- wmt14-en-fr # WMT14 English→French translation (BLEU)
|
| 13 |
+
|
| 14 |
+
num_fewshot: 0
|
| 15 |
+
limit: 100
|
| 16 |
+
seed: 42
|
| 17 |
+
batch_size: auto
|
| 18 |
+
device: auto
|
| 19 |
+
dtype: bfloat16
|
| 20 |
+
trust_remote_code: true
|
| 21 |
+
output_dir: results/lm_eval
|
research/evals/configs/lm_eval_medical.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Medical profile — clinical Q&A fact recall + reasoning
|
| 2 |
+
# Run: slm-lm-eval --profile medical --preset minicpm5-1b --experiment-name medical-baseline
|
| 3 |
+
|
| 4 |
+
profile: medical
|
| 5 |
+
claim: Better medical knowledge
|
| 6 |
+
|
| 7 |
+
tasks:
|
| 8 |
+
- pubmedqa # yes/no/maybe over biomedical abstracts (gate task)
|
| 9 |
+
- medmcqa # multi-subject medical entrance-exam MCQ
|
| 10 |
+
- medqa_4options # USMLE-style 4-option clinical MCQ
|
| 11 |
+
- arc_challenge # general-capability guard (catch regression from skill tuning)
|
| 12 |
+
|
| 13 |
+
num_fewshot: null # per-task canonical fewshot
|
| 14 |
+
limit: 200 # larger sample -> tighter stderr for gate decisions
|
| 15 |
+
seed: 42
|
| 16 |
+
batch_size: auto
|
| 17 |
+
device: auto
|
| 18 |
+
dtype: bfloat16
|
| 19 |
+
trust_remote_code: true
|
| 20 |
+
output_dir: results/lm_eval
|
research/evals/configs/lm_eval_multilingual.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multilingual profile — cross-lingual NLI / commonsense / coreference
|
| 2 |
+
# Pairs with the FR/AR language-lesson adapter. Run:
|
| 3 |
+
# slm-lm-eval --profile multilingual --preset minicpm5-1b --experiment-name multilingual-baseline
|
| 4 |
+
|
| 5 |
+
profile: multilingual
|
| 6 |
+
claim: Better multilingual understanding
|
| 7 |
+
|
| 8 |
+
tasks:
|
| 9 |
+
- xnli # cross-lingual natural-language inference (15 langs incl. fr/ar)
|
| 10 |
+
- xcopa # cross-lingual causal commonsense
|
| 11 |
+
- xwinograd # cross-lingual coreference (Winograd schema)
|
| 12 |
+
|
| 13 |
+
num_fewshot: 0
|
| 14 |
+
limit: 100
|
| 15 |
+
seed: 42
|
| 16 |
+
batch_size: auto
|
| 17 |
+
device: auto
|
| 18 |
+
dtype: bfloat16
|
| 19 |
+
trust_remote_code: true
|
| 20 |
+
output_dir: results/lm_eval
|
research/evals/configs/lm_eval_safety.yaml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Safety / truthfulness profile — resist imitative falsehoods
|
| 2 |
+
# EVAL-ONLY: do NOT fine-tune on TruthfulQA (it is the test set — contamination).
|
| 3 |
+
# Improve it indirectly via high-quality helpful/honest instruction data, then
|
| 4 |
+
# measure here. Run:
|
| 5 |
+
# slm-lm-eval --profile safety --preset minicpm5-1b --experiment-name safety-baseline
|
| 6 |
+
|
| 7 |
+
profile: safety
|
| 8 |
+
claim: More truthful, fewer imitative falsehoods
|
| 9 |
+
|
| 10 |
+
tasks:
|
| 11 |
+
- truthfulqa_mc2 # multi-true MC truthfulness (primary)
|
| 12 |
+
- truthfulqa_mc1 # single-true MC truthfulness
|
| 13 |
+
- arc_easy # general-capability guard
|
| 14 |
+
|
| 15 |
+
num_fewshot: 0
|
| 16 |
+
limit: 200
|
| 17 |
+
seed: 42
|
| 18 |
+
batch_size: auto
|
| 19 |
+
device: auto
|
| 20 |
+
dtype: bfloat16
|
| 21 |
+
trust_remote_code: true
|
| 22 |
+
output_dir: results/lm_eval
|
research/evals/docs/eval_profiles.md
CHANGED
|
@@ -51,6 +51,8 @@ Use **one profile per claim**. Do not compare training loss to lm-eval accuracy.
|
|
| 51 |
| Better language understanding | `understanding` | `slm-lm-eval` | `boolq`, `piqa`, `copa`, `rte` |
|
| 52 |
| Better code generation | `code` | `slm-lm-eval` | `humaneval`, `mbpp` |
|
| 53 |
| Better instruction following | `instructions` | `slm-lm-eval` | `ifeval` |
|
|
|
|
|
|
|
| 54 |
| General ~1B SLM baseline | `general_slm` | `slm-lm-eval` | 6-task mix (full splits) |
|
| 55 |
| Baseline vs finetune study | `compare_study` | `slm-lm-eval` | Same 6 tasks, limit 100 |
|
| 56 |
| Tool use / function calling | `agentic_tool_use` | `slm-benchmark` | `bfcl`, `tau_bench` |
|
|
@@ -72,6 +74,8 @@ Use **one profile per claim**. Do not compare training loss to lm-eval accuracy.
|
|
| 72 |
| `understanding` | `lm_eval_understanding.yaml` |
|
| 73 |
| `code` | `lm_eval_code.yaml` |
|
| 74 |
| `instructions` | `lm_eval_instructions.yaml` |
|
|
|
|
|
|
|
| 75 |
| `general_slm` | `lm_eval_minicpm5.yaml` |
|
| 76 |
| `compare_study` | `lm_eval_compare_study.yaml` |
|
| 77 |
|
|
|
|
| 51 |
| Better language understanding | `understanding` | `slm-lm-eval` | `boolq`, `piqa`, `copa`, `rte` |
|
| 52 |
| Better code generation | `code` | `slm-lm-eval` | `humaneval`, `mbpp` |
|
| 53 |
| Better instruction following | `instructions` | `slm-lm-eval` | `ifeval` |
|
| 54 |
+
| Better French / translation | `french` | `slm-lm-eval` | `french_bench_xnli`, `belebele_fra_Latn`, `wmt14-en-fr`, … |
|
| 55 |
+
| Better multilingual understanding | `multilingual` | `slm-lm-eval` | `xnli`, `xcopa`, `xwinograd` |
|
| 56 |
| General ~1B SLM baseline | `general_slm` | `slm-lm-eval` | 6-task mix (full splits) |
|
| 57 |
| Baseline vs finetune study | `compare_study` | `slm-lm-eval` | Same 6 tasks, limit 100 |
|
| 58 |
| Tool use / function calling | `agentic_tool_use` | `slm-benchmark` | `bfcl`, `tau_bench` |
|
|
|
|
| 74 |
| `understanding` | `lm_eval_understanding.yaml` |
|
| 75 |
| `code` | `lm_eval_code.yaml` |
|
| 76 |
| `instructions` | `lm_eval_instructions.yaml` |
|
| 77 |
+
| `french` | `lm_eval_french.yaml` |
|
| 78 |
+
| `multilingual` | `lm_eval_multilingual.yaml` |
|
| 79 |
| `general_slm` | `lm_eval_minicpm5.yaml` |
|
| 80 |
| `compare_study` | `lm_eval_compare_study.yaml` |
|
| 81 |
|
research/finetune.py
CHANGED
|
@@ -471,12 +471,13 @@ def save_training_results(
|
|
| 471 |
return path
|
| 472 |
|
| 473 |
|
| 474 |
-
def to_prompt_response(example, fmt, tokenizer, keys=None):
|
| 475 |
"""Normalize any supported format into a single training string,
|
| 476 |
returning (full_text, prompt_text). prompt_text is None for raw text.
|
| 477 |
|
| 478 |
`keys` optionally remaps a dataset's column names onto the format's
|
| 479 |
-
expected fields (e.g. {"prompt": "query"} for MetaMathQA).
|
|
|
|
| 480 |
keys = keys or {}
|
| 481 |
if fmt == "text":
|
| 482 |
return example[keys.get("text", "text")], None
|
|
@@ -491,6 +492,8 @@ def to_prompt_response(example, fmt, tokenizer, keys=None):
|
|
| 491 |
|
| 492 |
elif fmt == "prompt":
|
| 493 |
prompt = example.get(keys.get("prompt", "prompt"), "")
|
|
|
|
|
|
|
| 494 |
rkey = keys.get("response")
|
| 495 |
resp = example.get(rkey, "") if rkey else example.get(
|
| 496 |
"completion", example.get("response", ""))
|
|
@@ -517,9 +520,10 @@ def to_prompt_response(example, fmt, tokenizer, keys=None):
|
|
| 517 |
return full, prompt_only
|
| 518 |
|
| 519 |
|
| 520 |
-
def build_tokenize_fn(tokenizer, fmt, max_len, mask_prompt, keys=None):
|
| 521 |
def fn(example):
|
| 522 |
-
full, prompt = to_prompt_response(
|
|
|
|
| 523 |
ids = tokenizer(full, truncation=True, max_length=max_len,
|
| 524 |
add_special_tokens=(fmt == "text"))["input_ids"]
|
| 525 |
labels = list(ids)
|
|
@@ -593,7 +597,9 @@ def build_training_dataset(args, tokenizer):
|
|
| 593 |
raw = raw.shuffle(seed=args.seed)
|
| 594 |
keys = spec.get("columns") or {}
|
| 595 |
max_len = spec.get("max_len", args.max_len)
|
| 596 |
-
|
|
|
|
|
|
|
| 597 |
tok = raw.map(tokenize, remove_columns=raw.column_names,
|
| 598 |
desc=f"tokenizing {dataset}")
|
| 599 |
tok = tok.filter(lambda e: len(e["input_ids"]) > 1)
|
|
|
|
| 471 |
return path
|
| 472 |
|
| 473 |
|
| 474 |
+
def to_prompt_response(example, fmt, tokenizer, keys=None, prompt_prefix=None):
|
| 475 |
"""Normalize any supported format into a single training string,
|
| 476 |
returning (full_text, prompt_text). prompt_text is None for raw text.
|
| 477 |
|
| 478 |
`keys` optionally remaps a dataset's column names onto the format's
|
| 479 |
+
expected fields (e.g. {"prompt": "query"} for MetaMathQA).
|
| 480 |
+
`prompt_prefix` prepends fixed instruction text to prompt-format user turns."""
|
| 481 |
keys = keys or {}
|
| 482 |
if fmt == "text":
|
| 483 |
return example[keys.get("text", "text")], None
|
|
|
|
| 492 |
|
| 493 |
elif fmt == "prompt":
|
| 494 |
prompt = example.get(keys.get("prompt", "prompt"), "")
|
| 495 |
+
if prompt_prefix:
|
| 496 |
+
prompt = f"{prompt_prefix}{prompt}"
|
| 497 |
rkey = keys.get("response")
|
| 498 |
resp = example.get(rkey, "") if rkey else example.get(
|
| 499 |
"completion", example.get("response", ""))
|
|
|
|
| 520 |
return full, prompt_only
|
| 521 |
|
| 522 |
|
| 523 |
+
def build_tokenize_fn(tokenizer, fmt, max_len, mask_prompt, keys=None, prompt_prefix=None):
|
| 524 |
def fn(example):
|
| 525 |
+
full, prompt = to_prompt_response(
|
| 526 |
+
example, fmt, tokenizer, keys, prompt_prefix=prompt_prefix)
|
| 527 |
ids = tokenizer(full, truncation=True, max_length=max_len,
|
| 528 |
add_special_tokens=(fmt == "text"))["input_ids"]
|
| 529 |
labels = list(ids)
|
|
|
|
| 597 |
raw = raw.shuffle(seed=args.seed)
|
| 598 |
keys = spec.get("columns") or {}
|
| 599 |
max_len = spec.get("max_len", args.max_len)
|
| 600 |
+
prefix = spec.get("prompt_prefix")
|
| 601 |
+
tokenize = build_tokenize_fn(
|
| 602 |
+
tokenizer, fmt, max_len, args.mask_prompt, keys, prompt_prefix=prefix)
|
| 603 |
tok = raw.map(tokenize, remove_columns=raw.column_names,
|
| 604 |
desc=f"tokenizing {dataset}")
|
| 605 |
tok = tok.filter(lambda e: len(e["input_ids"]) > 1)
|
research/modal/README.md
CHANGED
|
@@ -101,6 +101,8 @@ QLoRA adapter per category, each evaluated against the matching
|
|
| 101 |
| `math-lora` | math | `TIGER-Lab/MathInstruct` (`alpaca`) | `math` | `gsm8k` (+ `arc_challenge` guard) | ✅ |
|
| 102 |
| `coding-lora` | coding | `iamtarun/python_code_instructions_18k_alpaca` (`alpaca`) | `code` | `mbpp` | ✅ |
|
| 103 |
| `reasoning-lora` | reasoning | `HuggingFaceTB/smoltalk` (`chat`) | `reasoning` | `gsm8k` (+ `hellaswag` guard) | ✅ |
|
|
|
|
|
|
|
| 104 |
| `alpaca-lora` | instructions | `tatsu-lab/alpaca` (`alpaca`) | `instructions` | — (no `goals`) | local-only |
|
| 105 |
|
| 106 |
Before publishing, replace `defaults.hub_org` and each job's `publish.hub_repo`
|
|
|
|
| 101 |
| `math-lora` | math | `TIGER-Lab/MathInstruct` (`alpaca`) | `math` | `gsm8k` (+ `arc_challenge` guard) | ✅ |
|
| 102 |
| `coding-lora` | coding | `iamtarun/python_code_instructions_18k_alpaca` (`alpaca`) | `code` | `mbpp` | ✅ |
|
| 103 |
| `reasoning-lora` | reasoning | `HuggingFaceTB/smoltalk` (`chat`) | `reasoning` | `gsm8k` (+ `hellaswag` guard) | ✅ |
|
| 104 |
+
| `language-lesson-lora` | language | `language-lesson-fr/ar.jsonl` (`chat`) | `multilingual` | `xnli` (+ `hellaswag` guard) | ✅ |
|
| 105 |
+
| `french-lora` | french | `FrancophonIA/english_french` (`prompt`) + FR chat | `french` | `french_bench_xnli` (+ `hellaswag` guard) | ✅ |
|
| 106 |
| `alpaca-lora` | instructions | `tatsu-lab/alpaca` (`alpaca`) | `instructions` | — (no `goals`) | local-only |
|
| 107 |
|
| 108 |
Before publishing, replace `defaults.hub_org` and each job's `publish.hub_repo`
|
research/modal/experiments.yaml
CHANGED
|
@@ -81,28 +81,63 @@ finetune:
|
|
| 81 |
- build-small-hackathon/minicpm5-1b-teaching-lora
|
| 82 |
private: false
|
| 83 |
|
| 84 |
-
# --- science:
|
| 85 |
-
#
|
| 86 |
-
#
|
| 87 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
- name: science-lora
|
| 89 |
category: science
|
| 90 |
-
max_steps:
|
| 91 |
mix:
|
| 92 |
-
- dataset:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
format: chat
|
| 94 |
-
weight:
|
| 95 |
-
- dataset:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
format: alpaca
|
| 97 |
-
dataset_split: "train[:
|
| 98 |
-
max_samples:
|
| 99 |
args:
|
| 100 |
-
lora_r:
|
| 101 |
-
lora_alpha:
|
| 102 |
-
|
| 103 |
-
early_stopping_patience: 2 # keep best eval_loss checkpoint, not the last
|
| 104 |
val_split: 0.05
|
| 105 |
-
description:
|
|
|
|
|
|
|
| 106 |
eval_profile: science
|
| 107 |
goals:
|
| 108 |
task: sciq
|
|
@@ -214,6 +249,127 @@ finetune:
|
|
| 214 |
- build-small-hackathon/minicpm5-1b-reasoning-lora
|
| 215 |
private: false
|
| 216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
# --- general instructions baseline: no goals/publish -> local-only adapter ---
|
| 218 |
- name: alpaca-lora
|
| 219 |
category: instructions
|
|
@@ -252,9 +408,9 @@ finetune:
|
|
| 252 |
description: >
|
| 253 |
FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
|
| 254 |
build_language_lesson_chat.py) + English replay
|
| 255 |
-
eval_profile:
|
| 256 |
goals:
|
| 257 |
-
task:
|
| 258 |
min_improve: 0.0
|
| 259 |
guard_tasks:
|
| 260 |
- task: hellaswag
|
|
@@ -264,3 +420,47 @@ finetune:
|
|
| 264 |
mirror_repos:
|
| 265 |
- build-small-hackathon/minicpm5-1b-language-lesson-lora
|
| 266 |
private: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
- build-small-hackathon/minicpm5-1b-teaching-lora
|
| 82 |
private: false
|
| 83 |
|
| 84 |
+
# --- science: MC-format science Q&A (sciq/ARC/OpenBookQA train) ---
|
| 85 |
+
# Previous attempt used chat-format tutoring — wrong signal for MC benchmarks.
|
| 86 |
+
# Model already scores 0.935 sciq; needs in-distribution MC Q→A to push higher.
|
| 87 |
+
# allenai/sciq train: 11k factual science MC (question→correct_answer).
|
| 88 |
+
# allenai/ai2_arc ARC-Easy train: elementary/science MC, boosts arc_* guards.
|
| 89 |
+
# allenai/openbookqa train: fact-based science Q&A, improves openbookqa eval.
|
| 90 |
+
# Local science-tutor-chat kept at low weight for style/explanation diversity.
|
| 91 |
+
# MetaMathQA slice protects gsm8k guard (prevented 0.14 regression last run).
|
| 92 |
+
# Reduced to r=16, no NEFTune: less catastrophic forgetting on small datasets.
|
| 93 |
- name: science-lora
|
| 94 |
category: science
|
| 95 |
+
max_steps: 120
|
| 96 |
mix:
|
| 97 |
+
- dataset: allenai/sciq # 11k MC science Q→A (in-distribution with sciq eval)
|
| 98 |
+
format: prompt
|
| 99 |
+
columns:
|
| 100 |
+
prompt: question
|
| 101 |
+
response: correct_answer
|
| 102 |
+
dataset_split: "train[:1500]"
|
| 103 |
+
max_samples: 1500
|
| 104 |
+
- dataset: allenai/ai2_arc # ARC-Easy config only: elementary science MC
|
| 105 |
+
format: prompt
|
| 106 |
+
dataset_config: ARC-Easy
|
| 107 |
+
columns:
|
| 108 |
+
prompt: question
|
| 109 |
+
response: answerKey
|
| 110 |
+
dataset_split: "train[:500]"
|
| 111 |
+
max_samples: 500
|
| 112 |
+
- dataset: allenai/openbookqa # fact-based open science Q&A
|
| 113 |
+
format: prompt
|
| 114 |
+
columns:
|
| 115 |
+
prompt: question_stem
|
| 116 |
+
response: answerKey
|
| 117 |
+
dataset_split: "train[:400]"
|
| 118 |
+
max_samples: 400
|
| 119 |
+
- dataset: research/data/science-tutor-chat.jsonl # style diversity
|
| 120 |
format: chat
|
| 121 |
+
weight: 4
|
| 122 |
+
- dataset: meta-math/MetaMathQA # gsm8k guard protection
|
| 123 |
+
format: prompt
|
| 124 |
+
columns:
|
| 125 |
+
prompt: query
|
| 126 |
+
response: response
|
| 127 |
+
dataset_split: "train[:200]"
|
| 128 |
+
max_samples: 200
|
| 129 |
+
- dataset: tatsu-lab/alpaca # general replay: protect hellaswag/piqa/boolq
|
| 130 |
format: alpaca
|
| 131 |
+
dataset_split: "train[:400]"
|
| 132 |
+
max_samples: 400
|
| 133 |
args:
|
| 134 |
+
lora_r: 16
|
| 135 |
+
lora_alpha: 32
|
| 136 |
+
early_stopping_patience: 3
|
|
|
|
| 137 |
val_split: 0.05
|
| 138 |
+
description: >
|
| 139 |
+
sciq + ARC-Easy + OpenBookQA MC train + science-chat style + MetaMathQA
|
| 140 |
+
guard + alpaca replay. r=16, no NEFTune to avoid gsm8k regression.
|
| 141 |
eval_profile: science
|
| 142 |
goals:
|
| 143 |
task: sciq
|
|
|
|
| 249 |
- build-small-hackathon/minicpm5-1b-reasoning-lora
|
| 250 |
private: false
|
| 251 |
|
| 252 |
+
# --- medical: clinical Q&A (MedQA/Meadow) + alpaca replay ---
|
| 253 |
+
# New vertical. Same overfit-guard recipe as teaching (science-lora now uses r=16, no NEFTune): a focused
|
| 254 |
+
# skill dataset up-weighted, alpaca replay + NEFTune + r=32 so PubMedQA/MedMCQA
|
| 255 |
+
# improve without regressing the arc_challenge general-knowledge guard.
|
| 256 |
+
- name: medical-lora
|
| 257 |
+
category: medical
|
| 258 |
+
max_steps: 200
|
| 259 |
+
mix:
|
| 260 |
+
- dataset: medalpaca/medical_meadow_medqa # USMLE-style QA, alpaca columns
|
| 261 |
+
format: alpaca
|
| 262 |
+
dataset_split: "train[:2000]"
|
| 263 |
+
max_samples: 2000
|
| 264 |
+
- dataset: tatsu-lab/alpaca # general replay: protect guards
|
| 265 |
+
format: alpaca
|
| 266 |
+
dataset_split: "train[:600]"
|
| 267 |
+
max_samples: 600
|
| 268 |
+
args:
|
| 269 |
+
lora_r: 32
|
| 270 |
+
lora_alpha: 64
|
| 271 |
+
neftune_noise_alpha: 5
|
| 272 |
+
early_stopping_patience: 2
|
| 273 |
+
val_split: 0.05
|
| 274 |
+
description: Medical QA (medalpaca Meadow MedQA) + alpaca replay, r=32 + NEFTune
|
| 275 |
+
eval_profile: medical
|
| 276 |
+
goals:
|
| 277 |
+
task: pubmedqa
|
| 278 |
+
min_score: 0.45
|
| 279 |
+
min_improve: 0.02
|
| 280 |
+
guard_tasks:
|
| 281 |
+
- task: arc_challenge
|
| 282 |
+
max_regress: 0.03
|
| 283 |
+
publish:
|
| 284 |
+
hub_repo: MSGEncrypted/minicpm5-1b-medical-lora
|
| 285 |
+
mirror_repos:
|
| 286 |
+
- build-small-hackathon/minicpm5-1b-medical-lora
|
| 287 |
+
private: false
|
| 288 |
+
|
| 289 |
+
# --- tool-use: function/tool-calling (xLAM) ---
|
| 290 |
+
# New vertical that closes the loop with the existing BFCL agentic benchmark.
|
| 291 |
+
# The publish gate guards general ability (lm-eval has no function-call task);
|
| 292 |
+
# the *skill* metric is the BFCL/tau-bench suite run via slm-benchmark:
|
| 293 |
+
# uv run --package slm-evals slm-benchmark --model <adapter> --benchmarks bfcl --max-samples 50
|
| 294 |
+
- name: tool-use-lora
|
| 295 |
+
category: tool_use
|
| 296 |
+
max_steps: 200
|
| 297 |
+
mix:
|
| 298 |
+
- dataset: Salesforce/xlam-function-calling-60k
|
| 299 |
+
format: prompt
|
| 300 |
+
columns:
|
| 301 |
+
prompt: query
|
| 302 |
+
response: answers # JSON function-call(s) the model must emit
|
| 303 |
+
dataset_split: "train[:3000]"
|
| 304 |
+
max_samples: 3000
|
| 305 |
+
- dataset: tatsu-lab/alpaca # general replay: protect guards
|
| 306 |
+
format: alpaca
|
| 307 |
+
dataset_split: "train[:600]"
|
| 308 |
+
max_samples: 600
|
| 309 |
+
args:
|
| 310 |
+
lora_r: 32
|
| 311 |
+
lora_alpha: 64
|
| 312 |
+
neftune_noise_alpha: 5
|
| 313 |
+
early_stopping_patience: 2
|
| 314 |
+
val_split: 0.05
|
| 315 |
+
description: >
|
| 316 |
+
Function/tool-calling (Salesforce xLAM) + alpaca replay. Skill metric is the
|
| 317 |
+
BFCL agentic suite (slm-benchmark); lm-eval gate only guards general ability.
|
| 318 |
+
eval_profile: compare_study
|
| 319 |
+
goals:
|
| 320 |
+
task: arc_easy
|
| 321 |
+
min_improve: 0.0
|
| 322 |
+
guard_tasks:
|
| 323 |
+
- task: hellaswag
|
| 324 |
+
max_regress: 0.03
|
| 325 |
+
- task: piqa
|
| 326 |
+
max_regress: 0.03
|
| 327 |
+
publish:
|
| 328 |
+
hub_repo: MSGEncrypted/minicpm5-1b-tool-use-lora
|
| 329 |
+
mirror_repos:
|
| 330 |
+
- build-small-hackathon/minicpm5-1b-tool-use-lora
|
| 331 |
+
private: false
|
| 332 |
+
|
| 333 |
+
# --- commonsense: everyday-reasoning MCQ (CommonsenseQA train) ---
|
| 334 |
+
# New vertical. In-distribution MC train (question -> answerKey), same recipe
|
| 335 |
+
# as science-lora's ARC/OpenBookQA slices, with alpaca replay + piqa/hellaswag guards.
|
| 336 |
+
- name: commonsense-lora
|
| 337 |
+
category: commonsense
|
| 338 |
+
max_steps: 150
|
| 339 |
+
mix:
|
| 340 |
+
- dataset: tau/commonsense_qa # 5-way everyday-knowledge MCQ, in-distribution
|
| 341 |
+
format: prompt
|
| 342 |
+
columns:
|
| 343 |
+
prompt: question
|
| 344 |
+
response: answerKey
|
| 345 |
+
dataset_split: "train[:2000]"
|
| 346 |
+
max_samples: 2000
|
| 347 |
+
- dataset: tatsu-lab/alpaca # general replay: protect piqa/hellaswag guards
|
| 348 |
+
format: alpaca
|
| 349 |
+
dataset_split: "train[:600]"
|
| 350 |
+
max_samples: 600
|
| 351 |
+
args:
|
| 352 |
+
lora_r: 16
|
| 353 |
+
lora_alpha: 32
|
| 354 |
+
early_stopping_patience: 2
|
| 355 |
+
val_split: 0.05
|
| 356 |
+
description: CommonsenseQA MC train + alpaca replay, r=16
|
| 357 |
+
eval_profile: commonsense
|
| 358 |
+
goals:
|
| 359 |
+
task: commonsense_qa
|
| 360 |
+
min_score: 0.30
|
| 361 |
+
min_improve: 0.02
|
| 362 |
+
guard_tasks:
|
| 363 |
+
- task: piqa
|
| 364 |
+
max_regress: 0.03
|
| 365 |
+
- task: hellaswag
|
| 366 |
+
max_regress: 0.03
|
| 367 |
+
publish:
|
| 368 |
+
hub_repo: MSGEncrypted/minicpm5-1b-commonsense-lora
|
| 369 |
+
mirror_repos:
|
| 370 |
+
- build-small-hackathon/minicpm5-1b-commonsense-lora
|
| 371 |
+
private: false
|
| 372 |
+
|
| 373 |
# --- general instructions baseline: no goals/publish -> local-only adapter ---
|
| 374 |
- name: alpaca-lora
|
| 375 |
category: instructions
|
|
|
|
| 408 |
description: >
|
| 409 |
FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
|
| 410 |
build_language_lesson_chat.py) + English replay
|
| 411 |
+
eval_profile: multilingual
|
| 412 |
goals:
|
| 413 |
+
task: xnli
|
| 414 |
min_improve: 0.0
|
| 415 |
guard_tasks:
|
| 416 |
- task: hellaswag
|
|
|
|
| 420 |
mirror_repos:
|
| 421 |
- build-small-hackathon/minicpm5-1b-language-lesson-lora
|
| 422 |
private: false
|
| 423 |
+
|
| 424 |
+
# --- french: EN→FR translation (FrancophonIA/english_french) + FrenchBench gate ---
|
| 425 |
+
# 320k parallel sentences from Kaggle englishfrench-fornmt (Hub: FrancophonIA/english_french).
|
| 426 |
+
# FrenchBench (CroissantLLM) is the official French lm-eval suite; gate on french_bench_xnli.
|
| 427 |
+
- name: french-lora
|
| 428 |
+
category: french
|
| 429 |
+
max_steps: 150
|
| 430 |
+
mix:
|
| 431 |
+
- dataset: FrancophonIA/english_french
|
| 432 |
+
format: prompt
|
| 433 |
+
columns:
|
| 434 |
+
prompt: english
|
| 435 |
+
response: french
|
| 436 |
+
prompt_prefix: "Translate the following English sentence to French:\n"
|
| 437 |
+
dataset_split: "train[:3000]"
|
| 438 |
+
max_samples: 3000
|
| 439 |
+
- dataset: research/data/language-lesson-fr.jsonl
|
| 440 |
+
format: chat
|
| 441 |
+
weight: 6
|
| 442 |
+
- dataset: tatsu-lab/alpaca
|
| 443 |
+
format: alpaca
|
| 444 |
+
dataset_split: "train[:400]"
|
| 445 |
+
max_samples: 400
|
| 446 |
+
args:
|
| 447 |
+
lora_r: 32
|
| 448 |
+
lora_alpha: 64
|
| 449 |
+
neftune_noise_alpha: 5
|
| 450 |
+
early_stopping_patience: 2
|
| 451 |
+
val_split: 0.05
|
| 452 |
+
description: >
|
| 453 |
+
EN→FR translation (FrancophonIA/english_french) + TeacherVoice FR chat +
|
| 454 |
+
alpaca replay. Evaluated on FrenchBench (french_bench_xnli, belebele_fra_Latn).
|
| 455 |
+
eval_profile: french
|
| 456 |
+
goals:
|
| 457 |
+
task: french_bench_xnli
|
| 458 |
+
min_improve: 0.01
|
| 459 |
+
guard_tasks:
|
| 460 |
+
- task: hellaswag
|
| 461 |
+
max_regress: 0.03
|
| 462 |
+
publish:
|
| 463 |
+
hub_repo: MSGEncrypted/minicpm5-1b-french-lora
|
| 464 |
+
mirror_repos:
|
| 465 |
+
- build-small-hackathon/minicpm5-1b-french-lora
|
| 466 |
+
private: false
|