NorthernTribe-Research committed on
Commit 4461ccc · verified · 1 parent: 2754a79

Add SOTA advancement pipeline: multi-stage weighted curriculum + pass@k eval harness.
README.md CHANGED
@@ -24,8 +24,11 @@ model from the merged dataset in `data/releases/v1/`.
 
  - `configs/deepseek_math.yaml`: preset for `DeepSeek-Math`
  - `configs/deepseek_math_v2.yaml`: preset for `DeepSeek-Math-V2`
+ - `configs/deepseek_math_sota.yaml`: multi-stage SOTA advancement recipe
  - `scripts/train_sft.py`: LoRA/QLoRA supervised fine-tuning + optional Hub push
+ - `scripts/train_sota.py`: weighted multi-stage curriculum fine-tuning
  - `scripts/merge_and_push.py`: optional adapter merge into full weights + Hub push
+ - `scripts/eval_sota.py`: self-consistency `pass@1` / `pass@k` evaluation harness
  - `requirements.txt`: model-training dependencies
 
  ## Setup
@@ -48,10 +51,38 @@ model from the merged dataset in `data/releases/v1/`.
    --config model_development/configs/deepseek_math_v2.yaml
  ```
 
+ ## SOTA Advancement Recipe (Multi-stage)
+
+ ```bash
+ .venv/bin/python model_development/scripts/train_sota.py \
+   --config model_development/configs/deepseek_math_sota.yaml
+ ```
+
+ This recipe runs:
+ - Stage 1: broad math bootstrap
+ - Stage 2: conjecture + formal proof specialization
+ - Stage 3: conjecture-core alignment
+
+ and saves a final adapter under:
+ - `model_development/runs/math-conjecture-sota/final_adapter`
+
+ ## Evaluate pass@k with self-consistency
+
+ ```bash
+ .venv/bin/python model_development/scripts/eval_sota.py \
+   --config model_development/configs/deepseek_math_sota.yaml \
+   --adapter-path model_development/runs/math-conjecture-sota/final_adapter \
+   --eval-file data/releases/v1/test.parquet \
+   --k 4 \
+   --max-samples 300
+ ```
+
  ## Important notes
 
  - Both presets point to `data/releases/v1/train.parquet` and
    `data/releases/v1/validation.parquet`.
+ - `deepseek_math_sota.yaml` defaults to `DeepSeek-Math-V2` and pushes to
+   `NorthernTribe-Research/math-conjecture-model`.
  - If your exact v2 checkpoint id differs, update `model.base_model` in
    `model_development/configs/deepseek_math_v2.yaml`.
  - Hub auth uses `HF_TOKEN` first, then `huggingface-api-key.json`.
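The auth precedence noted above (`HF_TOKEN` first, then `huggingface-api-key.json` with a `key` field, as in `resolve_auth` in `scripts/train_sota.py`) can be sketched as a minimal standalone helper; the function name here is illustrative, not part of the repo:

```python
import json
import os
from pathlib import Path
from typing import Optional


def resolve_hub_token(cred_path: str = "huggingface-api-key.json") -> Optional[str]:
    """Mirror the repo's auth order: HF_TOKEN env var wins, then the JSON key file."""
    token = (os.environ.get("HF_TOKEN") or "").strip()
    if token:
        return token
    path = Path(cred_path)
    if path.exists():
        data = json.loads(path.read_text(encoding="utf-8"))
        key = (data.get("key") or "").strip()
        if key:
            return key
    return None
```

If neither source yields a token, Hub pushes should be skipped rather than attempted anonymously.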
configs/deepseek_math_sota.yaml ADDED
@@ -0,0 +1,113 @@
+ global:
+   output_root: model_development/runs/math-conjecture-sota
+   seed: 17
+
+ model:
+   base_model: deepseek-ai/deepseek-math-v2
+   trust_remote_code: true
+   use_bf16: true
+   load_in_4bit: true
+   bnb_4bit_quant_type: nf4
+   bnb_4bit_use_double_quant: true
+   attn_implementation: null
+   lora:
+     r: 96
+     alpha: 192
+     dropout: 0.05
+     bias: none
+     target_modules:
+       - q_proj
+       - k_proj
+       - v_proj
+       - o_proj
+       - gate_proj
+       - up_proj
+       - down_proj
+
+ data:
+   default_train_file: data/releases/v1/train.parquet
+   default_validation_file: data/releases/v1/validation.parquet
+   prompt_field: prompt
+   target_field: target
+   final_answer_field: final_answer
+   proof_field: proof_formal
+   sample_weight_field: sample_weight
+   max_seq_length: 3072
+   min_loss_weight: 0.25
+   max_loss_weight: 6.0
+   family_boost:
+     conjecture_core: 2.5
+     formal_proof: 1.6
+     competition: 1.2
+     structured_reasoning: 1.0
+   system_prompt: |
+     You are a frontier mathematical reasoning model focused on unsolved
+     conjectures. Your outputs must be precise, technically coherent, and explicit
+     about uncertainty. Never claim a full proof unless it is derivable from given
+     assumptions or already established in cited prior results.
+
+ training_defaults:
+   per_device_train_batch_size: 1
+   per_device_eval_batch_size: 1
+   gradient_accumulation_steps: 16
+   weight_decay: 0.01
+   warmup_ratio: 0.03
+   lr_scheduler_type: cosine
+   max_grad_norm: 1.0
+   gradient_checkpointing: true
+   logging_steps: 10
+   save_steps: 400
+   eval_steps: 400
+   save_total_limit: 3
+   dataloader_num_workers: 2
+
+ stages:
+   - name: broad_math_bootstrap
+     max_train_samples: null
+     max_eval_samples: 3000
+     filters:
+       include_families:
+         - competition
+         - structured_reasoning
+         - formal_proof
+         - conjecture_core
+     training:
+       num_train_epochs: 1
+       learning_rate: 2.0e-5
+
+   - name: conjecture_specialization
+     max_train_samples: null
+     max_eval_samples: 2000
+     filters:
+       include_families:
+         - conjecture_core
+         - formal_proof
+       min_sample_weight: 2.0
+     training:
+       num_train_epochs: 2
+       learning_rate: 8.0e-6
+       save_steps: 250
+       eval_steps: 250
+
+   - name: conjecture_alignment
+     max_train_samples: null
+     max_eval_samples: null
+     filters:
+       include_families:
+         - conjecture_core
+       require_conjecture_id: true
+     training:
+       num_train_epochs: 3
+       learning_rate: 5.0e-6
+       save_steps: 100
+       eval_steps: 100
+
+ hub:
+   push_to_hub: true
+   repo_id: NorthernTribe-Research/math-conjecture-model
+   private: false
+   upload_stage_checkpoints: true
+   commit_message: Train multi-stage SOTA curriculum for conjecture reasoning.
+
+ credentials:
+   path: huggingface-api-key.json
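The per-example weighting that this config drives (`sample_weight` scaled by `family_boost`, then clamped to `[min_loss_weight, max_loss_weight]`, as implemented in `compute_loss_weight` in `scripts/train_sota.py`) reduces to a few lines; this is a minimal sketch with the config's defaults:

```python
def loss_weight(sample_weight: float, family: str,
                family_boost: dict,
                min_w: float = 0.25, max_w: float = 6.0) -> float:
    """Base sample weight times the family's boost factor, clamped to [min_w, max_w]."""
    w = sample_weight * family_boost.get(family, 1.0)
    return max(min_w, min(max_w, w))
```

For example, a `conjecture_core` row with `sample_weight: 4.0` would get 4.0 × 2.5 = 10.0, clamped down to the `max_loss_weight` of 6.0.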
requirements.txt CHANGED
@@ -6,3 +6,4 @@ peft>=0.14.0
  bitsandbytes>=0.45.0
  huggingface_hub>=0.26.0
  pyyaml>=6.0.2
+ sentencepiece>=0.2.0
scripts/eval_sota.py ADDED
@@ -0,0 +1,299 @@
+ #!/usr/bin/env python3
+ """Self-consistency evaluation for math-conjecture model checkpoints."""
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import re
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Sequence
+
+ import torch
+ import yaml
+ from datasets import load_dataset
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
+
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(description="Run pass@k-style evaluation on held-out split.")
+     parser.add_argument(
+         "--config",
+         type=Path,
+         default=Path("model_development/configs/deepseek_math_sota.yaml"),
+         help="Training config used for prompt formatting defaults.",
+     )
+     parser.add_argument(
+         "--base-model",
+         type=str,
+         default=None,
+         help="Override base model id from config.",
+     )
+     parser.add_argument(
+         "--adapter-path",
+         type=Path,
+         default=None,
+         help="Optional LoRA adapter path to load on top of base model.",
+     )
+     parser.add_argument(
+         "--eval-file",
+         type=Path,
+         default=Path("data/releases/v1/test.parquet"),
+         help="Parquet split used for evaluation.",
+     )
+     parser.add_argument("--max-samples", type=int, default=300, help="Maximum evaluation rows.")
+     parser.add_argument("--k", type=int, default=4, help="Number of sampled generations per prompt.")
+     parser.add_argument("--max-new-tokens", type=int, default=256, help="Generation length cap.")
+     parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature.")
+     parser.add_argument("--top-p", type=float, default=0.95, help="Nucleus sampling p.")
+     parser.add_argument("--seed", type=int, default=17, help="Random seed.")
+     parser.add_argument(
+         "--output-json",
+         type=Path,
+         default=Path("model_development/runs/latest_eval_report.json"),
+         help="Where to write evaluation report.",
+     )
+     return parser.parse_args()
+
+
+ def as_text(value: Any) -> str:
+     if value is None:
+         return ""
+     if isinstance(value, str):
+         return value.strip()
+     return str(value).strip()
+
+
+ def load_config(path: Path) -> Dict[str, Any]:
+     cfg = yaml.safe_load(path.read_text(encoding="utf-8"))
+     if not isinstance(cfg, dict):
+         raise ValueError("Invalid YAML config.")
+     return cfg
+
+
+ def normalize_answer(text: str) -> str:
+     text = text.strip().lower()
+     text = re.sub(r"\s+", " ", text)
+     text = text.replace("$", "")
+     return text
+
+
+ def flatten_expected(row: Dict[str, Any], data_cfg: Dict[str, Any]) -> List[str]:
+     out: List[str] = []
+     final_field = as_text(data_cfg.get("final_answer_field")) or "final_answer"
+     target_field = as_text(data_cfg.get("target_field")) or "target"
+
+     final_answer = row.get(final_field)
+     if final_answer is not None:
+         txt = as_text(final_answer)
+         if txt:
+             out.append(txt)
+
+     target = row.get(target_field)
+     if target is None:
+         return out
+     if isinstance(target, str):
+         stripped = target.strip()
+         if not stripped:
+             return out
+         try:
+             target = json.loads(stripped)
+         except json.JSONDecodeError:
+             out.append(stripped)
+             return out
+
+     if isinstance(target, dict):
+         for value in target.values():
+             if isinstance(value, list):
+                 for item in value:
+                     txt = as_text(item)
+                     if txt:
+                         out.append(txt)
+             else:
+                 txt = as_text(value)
+                 if txt:
+                     out.append(txt)
+     elif isinstance(target, list):
+         for item in target:
+             txt = as_text(item)
+             if txt:
+                 out.append(txt)
+     else:
+         txt = as_text(target)
+         if txt:
+             out.append(txt)
+     return out
+
+
+ def build_user_block(row: Dict[str, Any], data_cfg: Dict[str, Any]) -> str:
+     prompt_field = as_text(data_cfg.get("prompt_field")) or "prompt"
+     prompt = as_text(row.get(prompt_field))
+     if not prompt:
+         prompt = "Solve the math task."
+     meta_fields = [
+         ("task_type", "Task type"),
+         ("family", "Family"),
+         ("difficulty", "Difficulty"),
+         ("source_dataset", "Source"),
+         ("status_as_of", "Status as of"),
+     ]
+     lines = []
+     for key, label in meta_fields:
+         value = as_text(row.get(key))
+         if value:
+             lines.append(f"{label}: {value}")
+     if lines:
+         return f"{prompt}\n\nMetadata:\n" + "\n".join(lines)
+     return prompt
+
+
+ def build_prompt_text(row: Dict[str, Any], tokenizer: AutoTokenizer, data_cfg: Dict[str, Any]) -> str:
+     system_prompt = as_text(data_cfg.get("system_prompt"))
+     if not system_prompt:
+         system_prompt = "You are a rigorous mathematical reasoning assistant."
+     user_block = build_user_block(row, data_cfg)
+     if getattr(tokenizer, "chat_template", None):
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_block},
+         ]
+         return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     return f"System:\n{system_prompt}\n\nUser:\n{user_block}\n\nAssistant:\n"
+
+
+ def extract_candidate_text(full_generation: str, prompt_text: str) -> str:
+     if full_generation.startswith(prompt_text):
+         return full_generation[len(prompt_text) :].strip()
+     return full_generation.strip()
+
+
+ def is_match(candidate: str, expected_values: Sequence[str]) -> bool:
+     cand_norm = normalize_answer(candidate)
+     if not cand_norm:
+         return False
+     for expected in expected_values:
+         exp_norm = normalize_answer(expected)
+         if not exp_norm:
+             continue
+         if exp_norm in cand_norm or cand_norm in exp_norm:
+             return True
+         boxed = re.findall(r"\\boxed\{([^{}]+)\}", cand_norm)
+         if boxed and any(exp_norm in item for item in boxed):
+             return True
+     return False
+
+
+ def load_model_and_tokenizer(
+     base_model: str,
+     adapter_path: Optional[Path],
+     trust_remote_code: bool,
+ ) -> tuple[Any, AutoTokenizer]:
+     tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=trust_remote_code, use_fast=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
+     if tokenizer.pad_token is None:
+         tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
+
+     model = AutoModelForCausalLM.from_pretrained(
+         base_model,
+         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+         device_map="auto" if torch.cuda.is_available() else None,
+         trust_remote_code=trust_remote_code,
+     )
+     if adapter_path is not None:
+         model = PeftModel.from_pretrained(model, str(adapter_path))
+     model.eval()
+     return model, tokenizer
+
+
+ def main() -> None:
+     args = parse_args()
+     cfg = load_config(args.config)
+     data_cfg = cfg.get("data", {})
+     model_cfg = cfg.get("model", {})
+     set_seed(args.seed)
+
+     base_model = args.base_model or as_text(model_cfg.get("base_model"))
+     if not base_model:
+         raise ValueError("Base model is required via --base-model or config.model.base_model.")
+
+     model, tokenizer = load_model_and_tokenizer(
+         base_model=base_model,
+         adapter_path=args.adapter_path,
+         trust_remote_code=bool(model_cfg.get("trust_remote_code", False)),
+     )
+
+     if not args.eval_file.exists():
+         raise FileNotFoundError(f"Evaluation file not found: {args.eval_file}")
+     ds = load_dataset("parquet", data_files={"eval": str(args.eval_file)})["eval"]
+
+     if args.max_samples > 0 and args.max_samples < len(ds):
+         ds = ds.select(range(args.max_samples))
+
+     total = 0
+     hit_at_1 = 0
+     hit_at_k = 0
+     records = []
+
+     for row in ds:
+         expected_values = flatten_expected(row, data_cfg)
+         if not expected_values:
+             continue
+         prompt_text = build_prompt_text(row, tokenizer, data_cfg)
+         inputs = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=4096)
+         inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             output_ids = model.generate(
+                 **inputs,
+                 do_sample=True,
+                 temperature=args.temperature,
+                 top_p=args.top_p,
+                 num_return_sequences=args.k,
+                 max_new_tokens=args.max_new_tokens,
+                 pad_token_id=tokenizer.pad_token_id,
+                 eos_token_id=tokenizer.eos_token_id,
+             )
+         generations = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+         candidates = [extract_candidate_text(text, prompt_text) for text in generations]
+         matches = [is_match(candidate, expected_values) for candidate in candidates]
+         total += 1
+         if matches and matches[0]:
+             hit_at_1 += 1
+         if any(matches):
+             hit_at_k += 1
+
+         records.append(
+             {
+                 "uid": as_text(row.get("uid")),
+                 "prompt": as_text(row.get(as_text(data_cfg.get("prompt_field")) or "prompt")),
+                 "expected_values": expected_values[:5],
+                 "candidates": candidates,
+                 "matches": matches,
+             }
+         )
+
+     pass_at_1 = (hit_at_1 / total) if total else 0.0
+     pass_at_k = (hit_at_k / total) if total else 0.0
+     report = {
+         "base_model": base_model,
+         "adapter_path": str(args.adapter_path) if args.adapter_path is not None else None,
+         "eval_file": str(args.eval_file),
+         "evaluated_rows": total,
+         "k": args.k,
+         "pass_at_1": pass_at_1,
+         "pass_at_k": pass_at_k,
+         "temperature": args.temperature,
+         "top_p": args.top_p,
+         "max_new_tokens": args.max_new_tokens,
+         "samples": records[:30],
+     }
+     args.output_json.parent.mkdir(parents=True, exist_ok=True)
+     args.output_json.write_text(json.dumps(report, ensure_ascii=True, indent=2), encoding="utf-8")
+     print(json.dumps({k: report[k] for k in ("evaluated_rows", "pass_at_1", "pass_at_k", "k")}, indent=2))
+     print(f"Saved report to {args.output_json}")
+
+
+ if __name__ == "__main__":
+     main()
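The harness above reports an empirical pass@k: the fraction of prompts where any of the k sampled generations matches an expected answer. If you want estimates that are comparable across different sample budgets, the unbiased pass@k estimator from the Codex paper (Chen et al., 2021) is the usual alternative; it is not part of this repo, but a minimal sketch given n samples with c correct is:

```python
def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: 1 - C(n-c, k) / C(n, k).

    n: total samples drawn per prompt, c: number judged correct, k: budget.
    Computed as a running product to avoid large binomial coefficients.
    """
    if n - c < k:
        return 1.0  # fewer incorrect samples than k, so some correct one is always included
    result = 1.0
    for i in range(n - c + 1, n + 1):
        result *= 1.0 - k / i
    return 1.0 - result
```

Averaging this over prompts (with n > k samples per prompt) gives lower-variance estimates than the any-of-k hit rate.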
scripts/train_sota.py ADDED
@@ -0,0 +1,688 @@
1
+ #!/usr/bin/env python3
2
+ """Multi-stage curriculum SFT for advancing the conjecture math model."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Any, Dict, Optional, Tuple
11
+
12
+ import torch
13
+ import yaml
14
+ from datasets import Dataset, DatasetDict, load_dataset
15
+ from huggingface_hub import HfApi
16
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
17
+ from torch.utils.data import WeightedRandomSampler
18
+ from transformers import (
19
+ AutoModelForCausalLM,
20
+ AutoTokenizer,
21
+ BitsAndBytesConfig,
22
+ DataCollatorForSeq2Seq,
23
+ Trainer,
24
+ TrainingArguments,
25
+ set_seed,
26
+ )
27
+
28
+ DEFAULT_CONFIG_PATH = Path("model_development/configs/deepseek_math_sota.yaml")
29
+
30
+
31
+ def parse_args() -> argparse.Namespace:
32
+ parser = argparse.ArgumentParser(
33
+ description="Train DeepSeek-Math with a multi-stage SOTA curriculum recipe."
34
+ )
35
+ parser.add_argument(
36
+ "--config",
37
+ type=Path,
38
+ default=DEFAULT_CONFIG_PATH,
39
+ help="Path to multi-stage YAML config.",
40
+ )
41
+ parser.add_argument("--repo-id", type=str, default=None, help="Override hub.repo_id.")
42
+ parser.add_argument("--push-to-hub", action="store_true", help="Force push enabled.")
43
+ parser.add_argument("--no-push-to-hub", action="store_true", help="Force push disabled.")
44
+ parser.add_argument(
45
+ "--start-stage",
46
+ type=int,
47
+ default=1,
48
+ help="1-based stage index to start from.",
49
+ )
50
+ parser.add_argument(
51
+ "--max-stages",
52
+ type=int,
53
+ default=None,
54
+ help="Optional number of stages to run from --start-stage.",
55
+ )
56
+ parser.add_argument(
57
+ "--credentials-path",
58
+ type=Path,
59
+ default=None,
60
+ help="Override credentials.path.",
61
+ )
62
+ return parser.parse_args()
63
+
64
+
65
+ def as_text(value: Any) -> str:
66
+ if value is None:
67
+ return ""
68
+ if isinstance(value, str):
69
+ return value.strip()
70
+ return str(value).strip()
71
+
72
+
73
+ def as_float(value: Any, default: float) -> float:
74
+ if value is None:
75
+ return default
76
+ try:
77
+ return float(value)
78
+ except (TypeError, ValueError):
79
+ return default
80
+
81
+
82
+ def as_int(value: Any, default: int) -> int:
83
+ if value is None:
84
+ return default
85
+ try:
86
+ return int(value)
87
+ except (TypeError, ValueError):
88
+ return default
89
+
90
+
91
+ def load_config(path: Path) -> Dict[str, Any]:
92
+ if not path.exists():
93
+ raise FileNotFoundError(f"Config not found: {path}")
94
+ cfg = yaml.safe_load(path.read_text(encoding="utf-8"))
95
+ if not isinstance(cfg, dict):
96
+ raise ValueError(f"Invalid config format: {path}")
97
+ for key in ("model", "data", "stages"):
98
+ if key not in cfg:
99
+ raise ValueError(f"Missing config section: {key}")
100
+ if not isinstance(cfg["stages"], list) or not cfg["stages"]:
101
+ raise ValueError("Config must contain at least one stage in stages[].")
102
+ cfg.setdefault("global", {})
103
+ cfg.setdefault("training_defaults", {})
104
+ cfg.setdefault("hub", {})
105
+ cfg.setdefault("credentials", {})
106
+ return cfg
107
+
108
+
109
+ def apply_overrides(cfg: Dict[str, Any], args: argparse.Namespace) -> None:
110
+ if args.repo_id:
111
+ cfg.setdefault("hub", {})["repo_id"] = args.repo_id
112
+ if args.credentials_path is not None:
113
+ cfg.setdefault("credentials", {})["path"] = str(args.credentials_path)
114
+ if args.push_to_hub and args.no_push_to_hub:
115
+ raise ValueError("Cannot set both --push-to-hub and --no-push-to-hub.")
116
+ if args.push_to_hub:
117
+ cfg.setdefault("hub", {})["push_to_hub"] = True
118
+ if args.no_push_to_hub:
119
+ cfg.setdefault("hub", {})["push_to_hub"] = False
120
+
121
+
122
+ def resolve_auth(cfg: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
123
+ token = as_text(os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")) or None
124
+ username = as_text(os.environ.get("HF_USERNAME")) or None
125
+ cred_path = as_text(cfg.get("credentials", {}).get("path"))
126
+ if cred_path:
127
+ path = Path(cred_path)
128
+ if path.exists():
129
+ data = json.loads(path.read_text(encoding="utf-8"))
130
+ if token is None:
131
+ token = as_text(data.get("key")) or None
132
+ if username is None:
133
+ username = as_text(data.get("username")) or None
134
+ return token, username
135
+
136
+
137
+ def resolve_repo_id(cfg: Dict[str, Any], username: Optional[str], output_root: Path) -> Optional[str]:
138
+ repo_id = as_text(cfg.get("hub", {}).get("repo_id"))
139
+ if repo_id:
140
+ return repo_id
141
+ if not username:
142
+ return None
143
+ return f"{username}/{output_root.name}"
144
+
145
+
146
+ def stringify_structured(value: Any) -> str:
147
+ if value is None:
148
+ return ""
149
+ if isinstance(value, str):
150
+ text = value.strip()
151
+ if not text:
152
+ return ""
153
+ try:
154
+ parsed = json.loads(text)
155
+ except json.JSONDecodeError:
156
+ return text
157
+ return json.dumps(parsed, ensure_ascii=False, sort_keys=True)
158
+ return json.dumps(value, ensure_ascii=False, sort_keys=True)
159
+
160
+
161
+ def build_user_block(row: Dict[str, Any], data_cfg: Dict[str, Any]) -> str:
162
+ prompt_field = as_text(data_cfg.get("prompt_field")) or "prompt"
163
+ prompt = as_text(row.get(prompt_field))
164
+ if not prompt:
165
+ prompt = "Solve the math task."
166
+ meta_fields = [
167
+ ("task_type", "Task type"),
168
+ ("family", "Family"),
169
+ ("difficulty", "Difficulty"),
170
+ ("source_dataset", "Source"),
171
+ ("status_as_of", "Status as of"),
172
+ ]
173
+ meta_lines = []
174
+ for key, label in meta_fields:
175
+ value = as_text(row.get(key))
176
+ if value:
177
+ meta_lines.append(f"{label}: {value}")
178
+ tags = row.get("topic_tags")
179
+ if isinstance(tags, list) and tags:
180
+ tag_text = ", ".join(as_text(tag) for tag in tags if as_text(tag))
181
+ if tag_text:
182
+ meta_lines.append(f"Tags: {tag_text}")
183
+ if not meta_lines:
184
+ return prompt
185
+ return f"{prompt}\n\nMetadata:\n" + "\n".join(meta_lines)
186
+
187
+
188
+ def build_answer_block(row: Dict[str, Any], data_cfg: Dict[str, Any]) -> str:
189
+ target_field = as_text(data_cfg.get("target_field")) or "target"
190
+ final_answer_field = as_text(data_cfg.get("final_answer_field")) or "final_answer"
191
+ proof_field = as_text(data_cfg.get("proof_field")) or "proof_formal"
192
+
193
+ sections = []
194
+ target_text = stringify_structured(row.get(target_field))
195
+ if target_text:
196
+ sections.append(f"Structured target:\n{target_text}")
197
+
198
+ final_answer = stringify_structured(row.get(final_answer_field))
199
+ if final_answer:
200
+ sections.append(f"Final answer:\n{final_answer}")
201
+
202
+ proof_text = stringify_structured(row.get(proof_field))
203
+ if proof_text:
204
+ sections.append(f"Formal proof snippet:\n{proof_text}")
205
+
206
+ if not sections:
207
+ sections.append("No structured target provided.")
208
+ return "\n\n".join(sections).strip()
209
+
210
+
211
+ def build_prompt_text(row: Dict[str, Any], tokenizer: AutoTokenizer, data_cfg: Dict[str, Any]) -> str:
212
+ system_prompt = as_text(data_cfg.get("system_prompt"))
213
+ if not system_prompt:
214
+ system_prompt = (
215
+ "You are a rigorous mathematical reasoning assistant focused on unsolved "
216
+ "conjectures. Produce checkable reasoning."
217
+ )
218
+ user_block = build_user_block(row, data_cfg)
219
+ if getattr(tokenizer, "chat_template", None):
220
+ messages = [
221
+ {"role": "system", "content": system_prompt},
222
+ {"role": "user", "content": user_block},
223
+ ]
224
+ return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
225
+ return f"System:\n{system_prompt}\n\nUser:\n{user_block}\n\nAssistant:\n"
226
+
227
+
228
+ def compute_loss_weight(row: Dict[str, Any], data_cfg: Dict[str, Any]) -> float:
229
+ sample_weight_field = as_text(data_cfg.get("sample_weight_field")) or "sample_weight"
230
+ base = as_float(row.get(sample_weight_field), 1.0)
231
+ family = as_text(row.get("family"))
232
+ family_boost = data_cfg.get("family_boost", {})
233
+ if isinstance(family_boost, dict):
234
+ base *= as_float(family_boost.get(family), 1.0)
235
+ min_w = as_float(data_cfg.get("min_loss_weight"), 0.1)
236
+ max_w = as_float(data_cfg.get("max_loss_weight"), 8.0)
237
+ if min_w > max_w:
238
+ min_w, max_w = max_w, min_w
239
+ return max(min_w, min(max_w, base))
240
+
241
+
242
+ def stage_split_files(stage_cfg: Dict[str, Any], data_cfg: Dict[str, Any]) -> Dict[str, str]:
243
+ train_file = as_text(stage_cfg.get("train_file")) or as_text(data_cfg.get("default_train_file"))
244
+ valid_file = as_text(stage_cfg.get("validation_file")) or as_text(data_cfg.get("default_validation_file"))
245
+ train_path = Path(train_file)
246
+ valid_path = Path(valid_file)
247
+ if not train_path.exists():
248
+ raise FileNotFoundError(f"Missing train split for stage: {train_path}")
249
+ if not valid_path.exists():
250
+ raise FileNotFoundError(f"Missing validation split for stage: {valid_path}")
251
+ return {"train": str(train_path), "validation": str(valid_path)}
252
+
253
+
254
+ def apply_filters(dataset: Dataset, filter_cfg: Dict[str, Any]) -> Dataset:
255
+ if not filter_cfg:
256
+ return dataset
257
+ include_families = set(filter_cfg.get("include_families", []) or [])
258
+ exclude_families = set(filter_cfg.get("exclude_families", []) or [])
259
+ include_task_types = set(filter_cfg.get("include_task_types", []) or [])
260
+ source_datasets = set(filter_cfg.get("source_datasets", []) or [])
261
+ require_conjecture_id = bool(filter_cfg.get("require_conjecture_id", False))
262
+ min_sample_weight = filter_cfg.get("min_sample_weight")
263
+ min_sample_weight = as_float(min_sample_weight, 0.0) if min_sample_weight is not None else None
264
+
265
+ def _keep(row: Dict[str, Any]) -> bool:
266
+ family = as_text(row.get("family"))
267
+ if include_families and family not in include_families:
268
+ return False
269
+ if exclude_families and family in exclude_families:
270
+ return False
271
+ if include_task_types:
272
+ task_type = as_text(row.get("task_type"))
273
+ if task_type not in include_task_types:
274
+ return False
275
+ if source_datasets:
276
+ source = as_text(row.get("source_dataset"))
277
+ if source not in source_datasets:
278
+ return False
279
+ if require_conjecture_id:
280
+ conjecture_id = as_text(row.get("conjecture_id"))
281
+ if not conjecture_id or conjecture_id.lower() == "null":
282
+ return False
283
+ if min_sample_weight is not None:
284
+ sample_weight = as_float(row.get("sample_weight"), 0.0)
285
+ if sample_weight < min_sample_weight:
286
+ return False
287
+ return True
288
+
289
+ return dataset.filter(_keep, desc="Applying stage filters")
290
+
291
+
292
+ def maybe_select(dataset: Dataset, max_samples: Optional[int]) -> Dataset:
293
+ if max_samples is None:
294
+ return dataset
295
+ if max_samples <= 0:
296
+ raise ValueError("max_samples must be positive.")
297
+ if max_samples >= len(dataset):
298
+ return dataset
299
+ return dataset.select(range(max_samples))
300
+
301
+
+ def tokenize_datasets(raw: DatasetDict, tokenizer: AutoTokenizer, data_cfg: Dict[str, Any]) -> DatasetDict:
+     max_len = as_int(data_cfg.get("max_seq_length"), 2048)
+     if max_len < 64:
+         raise ValueError("data.max_seq_length must be >= 64")
+     eos = tokenizer.eos_token or ""
+     remove_columns = raw["train"].column_names
+
+     def _tokenize(row: Dict[str, Any]) -> Dict[str, Any]:
+         prompt_text = build_prompt_text(row, tokenizer, data_cfg)
+         answer_text = build_answer_block(row, data_cfg)
+         full_text = f"{prompt_text}{answer_text}{eos}"
+         prompt_ids = tokenizer(prompt_text, add_special_tokens=False)["input_ids"]
+         full_enc = tokenizer(
+             full_text,
+             add_special_tokens=False,
+             truncation=True,
+             max_length=max_len,
+         )
+         input_ids = full_enc["input_ids"]
+         attention_mask = full_enc["attention_mask"]
+         if not input_ids:
+             fallback = tokenizer.eos_token_id
+             if fallback is None:
+                 fallback = tokenizer.pad_token_id
+             if fallback is None:
+                 fallback = 0
+             input_ids = [fallback]
+             attention_mask = [1]
+             labels = [fallback]
+         else:
+             prompt_len = min(len(prompt_ids), len(input_ids))
+             labels = [-100] * prompt_len + input_ids[prompt_len:]
+             if prompt_len >= len(input_ids):
+                 labels[-1] = input_ids[-1]
+         loss_weight = compute_loss_weight(row, data_cfg)
+         return {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "labels": labels,
+             "loss_weight": float(loss_weight),
+         }
+
+     tokenized = raw.map(
+         _tokenize,
+         remove_columns=remove_columns,
+         desc="Tokenizing prompt/answer pairs",
+     )
+     tokenized = tokenized.filter(
+         lambda row: any(token != -100 for token in row["labels"]),
+         desc="Dropping prompt-only rows",
+     )
+     return tokenized
+
+
+ def build_model_and_tokenizer(model_cfg: Dict[str, Any], training_defaults: Dict[str, Any]) -> Tuple[Any, AutoTokenizer]:
+     base_model = as_text(model_cfg.get("base_model"))
+     if not base_model:
+         raise ValueError("model.base_model is required.")
+
+     use_bf16 = bool(model_cfg.get("use_bf16", True))
+     dtype = torch.bfloat16 if use_bf16 else torch.float16
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         base_model,
+         trust_remote_code=bool(model_cfg.get("trust_remote_code", False)),
+         use_fast=True,
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
+         if tokenizer.pad_token is None:
+             tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
+
+     model_kwargs: Dict[str, Any] = {
+         "trust_remote_code": bool(model_cfg.get("trust_remote_code", False)),
+         "torch_dtype": dtype,
+     }
+     attn_impl = as_text(model_cfg.get("attn_implementation"))
+     if attn_impl:
+         model_kwargs["attn_implementation"] = attn_impl
+
+     load_in_4bit = bool(model_cfg.get("load_in_4bit", True))
+     if load_in_4bit:
+         if not torch.cuda.is_available():
+             raise RuntimeError("4-bit loading requested but CUDA is not available.")
+         model_kwargs["quantization_config"] = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type=as_text(model_cfg.get("bnb_4bit_quant_type")) or "nf4",
+             bnb_4bit_use_double_quant=bool(model_cfg.get("bnb_4bit_use_double_quant", True)),
+             bnb_4bit_compute_dtype=dtype,
+         )
+         model_kwargs["device_map"] = "auto"
+
+     model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
+     if tokenizer.pad_token_id is not None:
+         model.config.pad_token_id = tokenizer.pad_token_id
+     model.config.use_cache = False
+
+     if load_in_4bit:
+         model = prepare_model_for_kbit_training(
+             model,
+             use_gradient_checkpointing=bool(training_defaults.get("gradient_checkpointing", True)),
+         )
+
+     lora_cfg = model_cfg.get("lora", {})
+     peft_cfg = LoraConfig(
+         r=as_int(lora_cfg.get("r"), 64),
+         lora_alpha=as_int(lora_cfg.get("alpha"), 128),
+         lora_dropout=as_float(lora_cfg.get("dropout"), 0.05),
+         bias=as_text(lora_cfg.get("bias")) or "none",
+         task_type="CAUSAL_LM",
+         target_modules=lora_cfg.get("target_modules"),
+     )
+     model = get_peft_model(model, peft_cfg)
+     model.print_trainable_parameters()
+     return model, tokenizer
+
+
+ class WeightedLossCollator:
+     def __init__(self, tokenizer: AutoTokenizer, model: Any) -> None:
+         self.base = DataCollatorForSeq2Seq(
+             tokenizer=tokenizer,
+             model=model,
+             label_pad_token_id=-100,
+             pad_to_multiple_of=8,
+         )
+
+     def __call__(self, features: list[Dict[str, Any]]) -> Dict[str, Any]:
+         weights = [float(feature.pop("loss_weight", 1.0)) for feature in features]
+         batch = self.base(features)
+         batch["loss_weight"] = torch.tensor(weights, dtype=torch.float32)
+         return batch
+
+
+ class WeightedLossTrainer(Trainer):
+     def _get_train_sampler(self):
+         if self.train_dataset is None:
+             return None
+         if "loss_weight" not in self.train_dataset.column_names:
+             return super()._get_train_sampler()
+         weights = self.train_dataset["loss_weight"]
+         if not weights:
+             return super()._get_train_sampler()
+         weight_tensor = torch.tensor(weights, dtype=torch.double)
+         return WeightedRandomSampler(
+             weights=weight_tensor,
+             num_samples=len(weight_tensor),
+             replacement=True,
+         )
+
+     def compute_loss(
+         self,
+         model: Any,
+         inputs: Dict[str, Any],
+         return_outputs: bool = False,
+         num_items_in_batch: Optional[torch.Tensor] = None,
+     ):
+         loss_weight = inputs.pop("loss_weight", None)
+         labels = inputs.get("labels")
+         if labels is None:
+             return super().compute_loss(
+                 model=model,
+                 inputs=inputs,
+                 return_outputs=return_outputs,
+                 num_items_in_batch=num_items_in_batch,
+             )
+
+         model_inputs = {k: v for k, v in inputs.items() if k != "labels"}
+         outputs = model(**model_inputs)
+         logits = outputs.logits
+
+         shift_logits = logits[..., :-1, :].contiguous()
+         shift_labels = labels[..., 1:].contiguous()
+         token_losses = torch.nn.functional.cross_entropy(
+             shift_logits.view(-1, shift_logits.size(-1)),
+             shift_labels.view(-1),
+             ignore_index=-100,
+             reduction="none",
+         ).view(shift_labels.size())
+         token_mask = shift_labels.ne(-100).float()
+         seq_den = token_mask.sum(dim=1).clamp(min=1.0)
+         seq_loss = (token_losses * token_mask).sum(dim=1) / seq_den
+
+         if loss_weight is not None:
+             normalized = loss_weight.to(seq_loss.device).float().clamp(min=0.05)
+             loss = (seq_loss * normalized).sum() / normalized.sum()
+         else:
+             loss = seq_loss.mean()
+
+         if return_outputs:
+             return loss, outputs
+         return loss
+
+
+ def build_training_args(
+     output_dir: Path,
+     training_cfg: Dict[str, Any],
+     use_bf16: bool,
+     has_eval_split: bool,
+ ) -> TrainingArguments:
+     output_dir.mkdir(parents=True, exist_ok=True)
+     return TrainingArguments(
+         output_dir=str(output_dir),
+         num_train_epochs=as_float(training_cfg.get("num_train_epochs"), 1.0),
+         per_device_train_batch_size=as_int(training_cfg.get("per_device_train_batch_size"), 1),
+         per_device_eval_batch_size=as_int(training_cfg.get("per_device_eval_batch_size"), 1),
+         gradient_accumulation_steps=as_int(training_cfg.get("gradient_accumulation_steps"), 1),
+         learning_rate=as_float(training_cfg.get("learning_rate"), 2e-5),
+         weight_decay=as_float(training_cfg.get("weight_decay"), 0.0),
+         warmup_ratio=as_float(training_cfg.get("warmup_ratio"), 0.0),
+         lr_scheduler_type=as_text(training_cfg.get("lr_scheduler_type")) or "cosine",
+         max_grad_norm=as_float(training_cfg.get("max_grad_norm"), 1.0),
+         gradient_checkpointing=bool(training_cfg.get("gradient_checkpointing", True)),
+         logging_steps=as_int(training_cfg.get("logging_steps"), 10),
+         save_steps=as_int(training_cfg.get("save_steps"), 500),
+         save_total_limit=as_int(training_cfg.get("save_total_limit"), 3),
+         dataloader_num_workers=as_int(training_cfg.get("dataloader_num_workers"), 0),
+         seed=as_int(training_cfg.get("seed"), 17),
+         bf16=use_bf16,
+         fp16=not use_bf16,
+         remove_unused_columns=False,
+         report_to="none",
+         evaluation_strategy="steps" if has_eval_split else "no",
+         eval_steps=as_int(training_cfg.get("eval_steps"), 500) if has_eval_split else None,
+     )
+
+
+ def push_folder(
+     api: HfApi,
+     repo_id: str,
+     folder_path: Path,
+     commit_message: str,
+     path_in_repo: Optional[str] = None,
+ ) -> None:
+     kwargs: Dict[str, Any] = {
+         "repo_id": repo_id,
+         "repo_type": "model",
+         "folder_path": str(folder_path),
+         "commit_message": commit_message,
+     }
+     if path_in_repo:
+         kwargs["path_in_repo"] = path_in_repo
+     api.upload_folder(**kwargs)
+
+
+ def main() -> None:
+     args = parse_args()
+     cfg = load_config(args.config)
+     apply_overrides(cfg, args)
+
+     seed = as_int(cfg.get("global", {}).get("seed"), 17)
+     set_seed(seed)
+
+     output_root = Path(as_text(cfg.get("global", {}).get("output_root")) or "model_development/runs/math-conjecture-sota")
+     output_root.mkdir(parents=True, exist_ok=True)
+
+     token, username = resolve_auth(cfg)
+     repo_id = resolve_repo_id(cfg, username=username, output_root=output_root)
+     push_to_hub = bool(cfg.get("hub", {}).get("push_to_hub", False))
+     if push_to_hub:
+         if token is None:
+             raise ValueError("Hub push requested but token is missing.")
+         if repo_id is None:
+             raise ValueError("Hub push requested but repo_id is missing.")
+
+     model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
+     data_cfg = cfg["data"]
+     stage_reports = []
+
+     start_stage = max(1, args.start_stage)
+     stages = cfg["stages"]
+     end_stage = len(stages)
+     if args.max_stages is not None:
+         if args.max_stages <= 0:
+             raise ValueError("--max-stages must be positive.")
+         end_stage = min(end_stage, start_stage + args.max_stages - 1)
+
+     for index in range(start_stage, end_stage + 1):
+         stage = stages[index - 1]
+         stage_name = as_text(stage.get("name")) or f"stage_{index:02d}"
+         stage_slug = f"{index:02d}_{stage_name.replace(' ', '_')}"
+         stage_output_dir = output_root / stage_slug
+
+         split_files = stage_split_files(stage, data_cfg)
+         raw = load_dataset("parquet", data_files=split_files)
+         filters = stage.get("filters", {})
+         raw["train"] = apply_filters(raw["train"], filters)
+         raw["validation"] = apply_filters(raw["validation"], filters)
+         raw["train"] = maybe_select(raw["train"], stage.get("max_train_samples"))
+         raw["validation"] = maybe_select(raw["validation"], stage.get("max_eval_samples"))
+         if len(raw["train"]) == 0:
+             raise ValueError(f"Stage {stage_slug} has zero train rows after filtering.")
+
+         tokenized = tokenize_datasets(raw, tokenizer, data_cfg)
+         train_dataset = tokenized["train"]
+         eval_dataset = tokenized["validation"] if len(tokenized["validation"]) > 0 else None
+
+         merged_training = dict(cfg.get("training_defaults", {}))
+         merged_training.update(stage.get("training", {}))
+         merged_training["seed"] = seed
+         training_args = build_training_args(
+             output_dir=stage_output_dir,
+             training_cfg=merged_training,
+             use_bf16=bool(cfg["model"].get("use_bf16", True)),
+             has_eval_split=eval_dataset is not None,
+         )
+         collator = WeightedLossCollator(tokenizer=tokenizer, model=model)
+         trainer = WeightedLossTrainer(
+             model=model,
+             args=training_args,
+             train_dataset=train_dataset,
+             eval_dataset=eval_dataset,
+             tokenizer=tokenizer,
+             data_collator=collator,
+         )
+
+         train_result = trainer.train()
+         trainer.log_metrics("train", train_result.metrics)
+         trainer.save_metrics("train", train_result.metrics)
+         trainer.save_state()
+         if eval_dataset is not None:
+             eval_metrics = trainer.evaluate()
+             trainer.log_metrics("eval", eval_metrics)
+             trainer.save_metrics("eval", eval_metrics)
+         trainer.save_model(str(stage_output_dir))
+         tokenizer.save_pretrained(str(stage_output_dir))
+
+         report = {
+             "stage_index": index,
+             "stage_name": stage_name,
+             "output_dir": str(stage_output_dir),
+             "train_rows": len(train_dataset),
+             "eval_rows": len(eval_dataset) if eval_dataset is not None else 0,
+             "train_metrics": train_result.metrics,
+         }
+         stage_reports.append(report)
+
+     final_dir = output_root / "final_adapter"
+     final_dir.mkdir(parents=True, exist_ok=True)
+     model.save_pretrained(str(final_dir))
+     tokenizer.save_pretrained(str(final_dir))
+
+     summary = {
+         "config_path": str(args.config),
+         "repo_id": repo_id,
+         "seed": seed,
+         "stages_ran": stage_reports,
+         "final_adapter_dir": str(final_dir),
+     }
+     summary_path = output_root / "training_summary.json"
+     summary_path.write_text(json.dumps(summary, ensure_ascii=True, indent=2), encoding="utf-8")
+
+     if push_to_hub and repo_id is not None and token is not None:
+         api = HfApi(token=token)
+         api.create_repo(
+             repo_id=repo_id,
+             repo_type="model",
+             private=bool(cfg.get("hub", {}).get("private", False)),
+             exist_ok=True,
+         )
+         commit_message = as_text(cfg.get("hub", {}).get("commit_message")) or "Upload SOTA curriculum adapter."
+         push_folder(api, repo_id, final_dir, commit_message=commit_message)
+         if bool(cfg.get("hub", {}).get("upload_stage_checkpoints", False)):
+             for report in stage_reports:
+                 stage_dir = Path(report["output_dir"])
+                 path_in_repo = f"checkpoints/{stage_dir.name}"
+                 push_folder(
+                     api,
+                     repo_id,
+                     stage_dir,
+                     commit_message=f"Upload stage checkpoint {report['stage_name']}",
+                     path_in_repo=path_in_repo,
+                 )
+         api.upload_file(
+             path_or_fileobj=str(summary_path),
+             path_in_repo="training_summary.json",
+             repo_id=repo_id,
+             repo_type="model",
+             commit_message="Upload training summary for SOTA curriculum run.",
+         )
+         print(f"Pushed training artifacts to https://huggingface.co/{repo_id}")
+
+     print(f"Training complete. Final adapter: {final_dir}")
+     print(f"Training summary: {summary_path}")
+
+
+ if __name__ == "__main__":
+     main()