Spaces:

S-Dreamer
/

CodeCraftLab

Runtime error

App Files Files Community

S-Dreamer committed on May 16

Commit

b9ed97d

verified ·

1 Parent(s): 178abc4

Upload 4 files

Browse files

Files changed (4) hide show

evaluators.py +229 -0
hf-sync.yml +151 -0
logging.py +55 -0
pipeline.py +464 -0

evaluators.py ADDED Viewed

	@@ -0,0 +1,229 @@

+"""
+Evaluator implementations for code generation metrics.
+Each evaluator exposes a single method:
+    evaluate(model, tokenizer, dataset) -> float
+Scores are always in [0, 1].
+"""
+from __future__ import annotations
+import ast
+import multiprocessing
+import textwrap
+from abc import ABC, abstractmethod
+from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
+from typing import Any
+import numpy as np
+import torch
+from datasets import Dataset
+from sacrebleu.metrics import BLEU
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+# ---------------------------------------------------------------------------
+# Base
+# ---------------------------------------------------------------------------
+class BaseEvaluator(ABC):
+    @abstractmethod
+    def evaluate(
+        self,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
+        dataset: Dataset,
+    ) -> float:
+        ...
+    def _generate_batch(
+        self,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
+        prompts: list[str],
+        max_new_tokens: int = 256,
+        num_return_sequences: int = 1,
+        temperature: float = 0.2,
+    ) -> list[list[str]]:
+        """Generate completions for a list of prompts. Returns list-of-lists."""
+        results: list[list[str]] = []
+        device = next(model.parameters()).device
+        for prompt in prompts:
+            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    num_return_sequences=num_return_sequences,
+                    do_sample=temperature > 0,
+                    temperature=temperature if temperature > 0 else 1.0,
+                    top_p=0.95,
+                    pad_token_id=tokenizer.eos_token_id,
+                )
+            prompt_len = inputs["input_ids"].shape[1]
+            completions = [
+                tokenizer.decode(out[prompt_len:], skip_special_tokens=True)
+                for out in outputs
+            ]
+            results.append(completions)
+        return results
+# ---------------------------------------------------------------------------
+# Pass@k
+# ---------------------------------------------------------------------------
+class PassAtKEvaluator(BaseEvaluator):
+    """
+    Unbiased pass@k estimator from Chen et al. (2021):
+        pass@k = 1 - C(n-c, k) / C(n, k)
+    where n = total samples, c = correct samples.
+    """
+    def __init__(self, k: int = 1, n: int = 10) -> None:
+        self.k = k
+        self.n = n
+    def evaluate(
+        self,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
+        dataset: Dataset,
+        num_problems: int = 50,
+    ) -> float:
+        problems = dataset.select(range(min(num_problems, len(dataset))))
+        prompts = [str(ex.get("prompt", ex.get("content", ""))) for ex in problems]
+        references = [str(ex.get("canonical_solution", ex.get("content", ""))) for ex in problems]
+        all_completions = self._generate_batch(
+            model, tokenizer, prompts,
+            num_return_sequences=self.n,
+            temperature=0.8,  # diversity for pass@k
+        )
+        scores: list[float] = []
+        for completions, reference in zip(all_completions, references):
+            correct = sum(
+                1 for c in completions
+                if self._is_correct(c, reference)
+            )
+            scores.append(self._pass_at_k(n=self.n, c=correct, k=self.k))
+        return float(np.mean(scores))
+    @staticmethod
+    def _pass_at_k(n: int, c: int, k: int) -> float:
+        if n - c < k:
+            return 1.0
+        return 1.0 - float(np.prod([(n - c - i) / (n - i) for i in range(k)]))
+    @staticmethod
+    def _is_correct(completion: str, reference: str) -> bool:
+        # Basic syntactic check — override with execution check for HumanEval-style
+        try:
+            ast.parse(completion)
+            return completion.strip() == reference.strip()
+        except SyntaxError:
+            return False
+# ---------------------------------------------------------------------------
+# BLEU
+# ---------------------------------------------------------------------------
+class BleuEvaluator(BaseEvaluator):
+    def __init__(self, max_new_tokens: int = 256) -> None:
+        self._max_new_tokens = max_new_tokens
+        self._bleu = BLEU(effective_order=True)
+    def evaluate(
+        self,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
+        dataset: Dataset,
+        num_samples: int = 100,
+    ) -> float:
+        subset = dataset.select(range(min(num_samples, len(dataset))))
+        prompts = [str(ex.get("prompt", ex.get("content", ""))) for ex in subset]
+        references = [str(ex.get("canonical_solution", ex.get("content", ""))) for ex in subset]
+        completions_batch = self._generate_batch(
+            model, tokenizer, prompts, max_new_tokens=self._max_new_tokens
+        )
+        hypotheses = [batch[0] for batch in completions_batch]
+        result = self._bleu.corpus_score(hypotheses, [references])
+        # sacrebleu returns score in [0, 100]; normalise to [0, 1]
+        return result.score / 100.0
+# ---------------------------------------------------------------------------
+# Execution accuracy
+# ---------------------------------------------------------------------------
+def _run_code_safe(code: str, timeout: int) -> bool:
+    """Run in a subprocess to enforce timeout and isolate crashes."""
+    try:
+        exec(compile(code, "<string>", "exec"), {})  # noqa: S102
+        return True
+    except Exception:
+        return False
+class ExecutionAccuracyEvaluator(BaseEvaluator):
+    """Fraction of generated code snippets that execute without error."""
+    def __init__(self, timeout: int = 10, max_new_tokens: int = 256) -> None:
+        self._timeout = timeout
+        self._max_new_tokens = max_new_tokens
+    def evaluate(
+        self,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
+        dataset: Dataset,
+        num_samples: int = 50,
+    ) -> float:
+        subset = dataset.select(range(min(num_samples, len(dataset))))
+        prompts = [str(ex.get("prompt", ex.get("content", ""))) for ex in subset]
+        completions_batch = self._generate_batch(
+            model, tokenizer, prompts, max_new_tokens=self._max_new_tokens
+        )
+        codes = [batch[0] for batch in completions_batch]
+        passed = 0
+        with ProcessPoolExecutor(max_workers=4) as executor:
+            futures = {executor.submit(_run_code_safe, code, self._timeout): code for code in codes}
+            for future in futures:
+                try:
+                    if future.result(timeout=self._timeout + 1):
+                        passed += 1
+                except (FuturesTimeoutError, Exception):
+                    pass
+        return passed / len(codes) if codes else 0.0
+# ---------------------------------------------------------------------------
+# Exact match
+# ---------------------------------------------------------------------------
+class ExactMatchEvaluator(BaseEvaluator):
+    def evaluate(
+        self,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
+        dataset: Dataset,
+        num_samples: int = 100,
+    ) -> float:
+        subset = dataset.select(range(min(num_samples, len(dataset))))
+        prompts = [str(ex.get("prompt", ex.get("content", ""))) for ex in subset]
+        references = [str(ex.get("canonical_solution", ex.get("content", ""))) for ex in subset]
+        completions_batch = self._generate_batch(model, tokenizer, prompts)
+        hypotheses = [batch[0].strip() for batch in completions_batch]
+        matches = sum(h == r.strip() for h, r in zip(hypotheses, references))
+        return matches / len(references) if references else 0.0

hf-sync.yml ADDED Viewed

	@@ -0,0 +1,151 @@

+name: HF ↔ GitHub Sync
+# Trigger on every push to main (GitHub → HF direction)
+# and hourly to pull any HF-side changes back (HF → GitHub direction)
+on:
+  push:
+    branches: [main]
+  schedule:
+    - cron: '0 * * * *'   # hourly HF pull-check
+  # Manual runs may override the automatic direction detection via force_direction.
+  workflow_dispatch:
+    inputs:
+      force_direction:
+        description: 'Force sync direction (hf-wins | gh-wins | auto)'
+        required: false
+        default: 'auto'
+# Workflow-level environment; HF_REPO is read from the repository variable of the same name.
+env:
+  HF_REPO_TYPE: space          # model | dataset | space
+  HF_REPO: ${{ vars.HF_REPO }} # e.g. your-org/codecraftlab
+jobs:
+  sync:
+    name: Sync HuggingFace ↔ GitHub
+    runs-on: ubuntu-latest
+    # Write access is required because a later step pushes to origin.
+    permissions:
+      contents: write
+    steps:
+      # Full history (fetch-depth: 0) is needed so the merge-base between
+      # HEAD and hf/main can be computed in the divergence step.
+      - name: Checkout (full history)
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Configure git identity
+        run: |
+          git config user.email "sync-bot@codecraftlab.noreply"
+          git config user.name "CodeCraftLab Sync Bot"
+      - name: Install git-lfs
+        run: |
+          sudo apt-get install -y git-lfs
+          git lfs install
+      # The remote URL embeds the HF_TOKEN secret and is built as
+      # huggingface.co/<HF_REPO_TYPE>s/<HF_REPO>, i.e. "spaces/..." for the
+      # configured type.
+      # NOTE(review): confirm the "<type>s/" prefix is valid before switching
+      # HF_REPO_TYPE to "model" — Hub model repos may not use a prefix.
+      - name: Add HuggingFace remote
+        run: |
+          git remote add hf \
+            "https://user:${{ secrets.HF_TOKEN }}@huggingface.co/${HF_REPO_TYPE}s/${HF_REPO}"
+          git fetch hf --prune
+      # Decides the sync direction and exposes it as the step output `action`
+      # (push-to-hf | hf-wins | in-sync). The `reason=` lines are only echoed
+      # to the job log; they are not written to GITHUB_OUTPUT.
+      - name: Detect divergence and resolve
+        id: sync
+        env:
+          FORCE_DIRECTION: ${{ github.event.inputs.force_direction || 'auto' }}
+        run: |
+          set -euo pipefail
+          # --verify --quiet prints nothing when the ref is missing. A plain
+          # `git rev-parse hf/main` echoes the unresolved name to stdout before
+          # failing, so HF_HEAD would never equal "NONE".
+          HF_HEAD=$(git rev-parse --verify --quiet refs/remotes/hf/main || echo "NONE")
+          GH_HEAD=$(git rev-parse HEAD)
+          if [ "$HF_HEAD" = "NONE" ]; then
+            echo "action=push-to-hf" >> "$GITHUB_OUTPUT"
+            echo "reason=HF remote has no main branch — initial push"
+            exit 0
+          fi
+          # merge-base fails when the histories are unrelated; BASE is then
+          # "NONE" and the final else-branch (HF wins) applies.
+          BASE=$(git merge-base HEAD hf/main 2>/dev/null || echo "NONE")
+          if [ "$FORCE_DIRECTION" = "hf-wins" ]; then
+            echo "action=hf-wins" >> "$GITHUB_OUTPUT"
+            echo "reason=Forced HF-wins override"
+          elif [ "$FORCE_DIRECTION" = "gh-wins" ]; then
+            echo "action=push-to-hf" >> "$GITHUB_OUTPUT"
+            echo "reason=Forced GH-wins override"
+          elif [ "$HF_HEAD" = "$GH_HEAD" ]; then
+            echo "action=in-sync" >> "$GITHUB_OUTPUT"
+            echo "reason=Already in sync"
+          elif [ "$BASE" = "$GH_HEAD" ]; then
+            # HF is ahead — pull HF → GitHub
+            echo "action=hf-wins" >> "$GITHUB_OUTPUT"
+            echo "reason=GitHub is behind HF — fast-forward"
+          elif [ "$BASE" = "$HF_HEAD" ]; then
+            # GitHub is ahead — push GitHub → HF
+            echo "action=push-to-hf" >> "$GITHUB_OUTPUT"
+            echo "reason=HF is behind GitHub — pushing"
+          else
+            # Both diverged — HF is source of truth
+            echo "action=hf-wins" >> "$GITHUB_OUTPUT"
+            echo "reason=CONFLICT: both diverged — HF wins (source of truth)"
+          fi
+      # Exactly one of the next three steps runs, selected by the `action`
+      # output of the divergence step.
+      - name: "[In-sync] Nothing to do"
+        if: steps.sync.outputs.action == 'in-sync'
+        run: echo "✅ HF and GitHub are in sync — no action required."
+      - name: "[Push] GitHub → HuggingFace"
+        if: steps.sync.outputs.action == 'push-to-hf'
+        run: |
+          echo "📤 Pushing GitHub → HuggingFace"
+          git push hf main
+      # Resets the local checkout to hf/main and overwrites origin/main.
+      # NOTE(review): the `|| ... --force` fallback bypasses the protection
+      # --force-with-lease provides; confirm an unconditional overwrite of
+      # concurrent GitHub commits is intended.
+      - name: "[HF Wins] HuggingFace → GitHub"
+        if: steps.sync.outputs.action == 'hf-wins'
+        run: |
+          echo "📥 HuggingFace wins — overwriting GitHub main"
+          git reset --hard hf/main
+          git push origin main --force-with-lease || git push origin main --force
+      # Runs even when an earlier step failed; `action` is empty if the
+      # divergence step did not set it.
+      - name: Summary
+        if: always()
+        run: |
+          echo "### Sync Result" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Action:** ${{ steps.sync.outputs.action }}" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Trigger:** ${{ github.event_name }}" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Branch:** main" >> "$GITHUB_STEP_SUMMARY"
+  # ------------------------------------------------------------------
+  # Validate HF Space config on every push
+  # ------------------------------------------------------------------
+  validate-space-config:
+    name: Validate HF Space README config
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      # Fails the job when README.md lacks YAML frontmatter, misses a
+      # required key, or still declares the streamlit SDK.
+      - name: Check README frontmatter
+        run: |
+          python3 - <<'EOF'
+          import re, sys
+          with open("README.md", encoding="utf-8") as f:
+              content = f.read()
+          match = re.match(r"^---\r?\n(.*?)\r?\n---", content, re.DOTALL)
+          if not match:
+              print("❌ README is missing HF Space YAML frontmatter")
+              sys.exit(1)
+          frontmatter = match.group(1)
+          required_keys = ["title", "sdk", "app_port", "license"]
+          def has_key(key):
+              # Anchor at line start so e.g. "subtitle:" does not satisfy "title:".
+              return re.search(rf"^{re.escape(key)}\s*:", frontmatter, re.MULTILINE) is not None
+          missing = [k for k in required_keys if not has_key(k)]
+          if missing:
+              print(f"❌ Missing frontmatter keys: {missing}")
+              sys.exit(1)
+          # Tolerate optional quotes and flexible spacing around the value.
+          if re.search(r"^sdk\s*:\s*['\"]?streamlit['\"]?\s*$", frontmatter, re.MULTILINE):
+              print("❌ sdk is still 'streamlit' — should be 'docker' for FastAPI")
+              sys.exit(1)
+          print("✅ HF Space frontmatter is valid")
+          EOF

logging.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""
+Structured logging setup via structlog.
+Production: JSON output, machine-parseable.
+Development: colourised console output.
+Call configure_logging() once at application startup before any loggers are created.
+"""
+from __future__ import annotations
+import logging
+import sys
+import structlog
+def configure_logging(log_level: str = "INFO", env: str = "development") -> None:
+    """Configure structlog with environment-appropriate rendering."""
+    shared_processors: list[structlog.types.Processor] = [
+        structlog.contextvars.merge_contextvars,
+        structlog.stdlib.add_log_level,
+        structlog.stdlib.add_logger_name,
+        structlog.processors.TimeStamper(fmt="iso"),
+        structlog.processors.StackInfoRenderer(),
+    ]
+    if env == "production":
+        processors: list[structlog.types.Processor] = [
+            *shared_processors,
+            structlog.processors.dict_tracebacks,
+            structlog.processors.JSONRenderer(),
+        ]
+        renderer = structlog.processors.JSONRenderer()
+    else:
+        processors = [
+            *shared_processors,
+            structlog.dev.ConsoleRenderer(colors=True),
+        ]
+        renderer = structlog.dev.ConsoleRenderer(colors=True)
+    structlog.configure(
+        processors=processors,
+        wrapper_class=structlog.make_filtering_bound_logger(
+            getattr(logging, log_level.upper(), logging.INFO)
+        ),
+        context_class=dict,
+        logger_factory=structlog.PrintLoggerFactory(sys.stdout),
+        cache_logger_on_first_use=True,
+    )
+    # Silence noisy third-party loggers
+    for noisy in ("uvicorn.access", "httpx", "transformers", "datasets"):
+        logging.getLogger(noisy).setLevel(logging.WARNING)

pipeline.py ADDED Viewed

	@@ -0,0 +1,464 @@

+"""
+Fine-tuning pipeline with structured logging and eval hooks.
+Pipeline stages:
+  1. Preflight validation  — config, GPU, disk, token
+  2. Dataset preparation   — load, tokenize, split
+  3. Model initialisation  — base model + LoRA adapters
+  4. Training              — Trainer with custom callbacks
+  5. Evaluation            — post-training metric suite
+  6. Checkpoint export     — save + optional HF Hub push
+Each stage emits structured log events. Eval hooks are composable and
+run both during training (via TrainerCallback) and post-training.
+"""
+from __future__ import annotations
+import json
+import os
+import shutil
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+import structlog
+import torch
+from datasets import Dataset, DatasetDict, load_dataset
+from peft import LoraConfig, TaskType, get_peft_model
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    Trainer,
+    TrainerCallback,
+    TrainerControl,
+    TrainerState,
+    TrainingArguments,
+)
+from training.config import EvalMetric, EvalStrategy, TrainingJobConfig
+from training.evaluators import (
+    BleuEvaluator,
+    ExecutionAccuracyEvaluator,
+    ExactMatchEvaluator,
+    PassAtKEvaluator,
+)
+log = structlog.get_logger(__name__)
+# ---------------------------------------------------------------------------
+# Eval result container
+# ---------------------------------------------------------------------------
+@dataclass
+class EvalResults:
+    job_name: str
+    epoch: float
+    step: int
+    metrics: dict[str, float] = field(default_factory=dict)
+    errors: list[str] = field(default_factory=list)
+    duration_seconds: float = 0.0
+    def log(self, bound_log: structlog.BoundLogger) -> None:
+        bound_log.info(
+            "eval.completed",
+            epoch=self.epoch,
+            step=self.step,
+            duration_seconds=round(self.duration_seconds, 2),
+            **self.metrics,
+        )
+        for error in self.errors:
+            bound_log.warning("eval.error", message=error)
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "job_name": self.job_name,
+            "epoch": self.epoch,
+            "step": self.step,
+            "metrics": self.metrics,
+            "errors": self.errors,
+            "duration_seconds": self.duration_seconds,
+        }
+# ---------------------------------------------------------------------------
+# Eval hook registry
+# ---------------------------------------------------------------------------
+class EvalHookRunner:
+    """
+    Runs the configured evaluation metrics against a model + dataset.
+    Evaluators are resolved from the job config at construction time.
+    Each evaluator is independent; failures in one do not abort others.
+    """
+    def __init__(self, config: TrainingJobConfig, tokenizer: PreTrainedTokenizerBase) -> None:
+        self._config = config
+        self._tokenizer = tokenizer
+        self._evaluators = self._build_evaluators()
+        self._log = log.bind(job=config.job_name)
+    def _build_evaluators(self) -> dict[EvalMetric, Any]:
+        evals: dict[EvalMetric, Any] = {}
+        eval_cfg = self._config.evaluation
+        for metric in eval_cfg.metrics:
+            match metric:
+                case EvalMetric.PASS_AT_1:
+                    evals[metric] = PassAtKEvaluator(k=1, n=eval_cfg.num_samples_per_problem)
+                case EvalMetric.PASS_AT_10:
+                    evals[metric] = PassAtKEvaluator(k=10, n=eval_cfg.num_samples_per_problem)
+                case EvalMetric.BLEU:
+                    evals[metric] = BleuEvaluator()
+                case EvalMetric.EXECUTION_ACCURACY:
+                    evals[metric] = ExecutionAccuracyEvaluator(
+                        timeout=self._config.evaluation.timeout_seconds
+                    )
+                case EvalMetric.EXACT_MATCH:
+                    evals[metric] = ExactMatchEvaluator()
+        return evals
+    def run(
+        self,
+        model: PreTrainedModel,
+        eval_dataset: Dataset,
+        epoch: float,
+        step: int,
+    ) -> EvalResults:
+        start = time.perf_counter()
+        results = EvalResults(job_name=self._config.job_name, epoch=epoch, step=step)
+        model.eval()
+        with torch.no_grad():
+            for metric, evaluator in self._evaluators.items():
+                try:
+                    score = evaluator.evaluate(
+                        model=model,
+                        tokenizer=self._tokenizer,
+                        dataset=eval_dataset,
+                    )
+                    results.metrics[metric.value] = round(score, 4)
+                    self._log.info("eval.metric", metric=metric.value, score=score)
+                except Exception as exc:  # noqa: BLE001
+                    msg = f"{metric.value}: {exc}"
+                    results.errors.append(msg)
+                    self._log.warning("eval.metric_failed", metric=metric.value, error=str(exc))
+        results.duration_seconds = time.perf_counter() - start
+        results.log(self._log)
+        return results
+# ---------------------------------------------------------------------------
+# Custom training callback
+# ---------------------------------------------------------------------------
+class CodeCraftLabCallback(TrainerCallback):
+    """
+    Injects structured logging and eval hooks into the HF Trainer loop.
+    """
+    def __init__(
+        self,
+        hook_runner: EvalHookRunner,
+        eval_dataset: Dataset,
+        results_path: Path,
+    ) -> None:
+        self._runner = hook_runner
+        self._eval_dataset = eval_dataset
+        self._results_path = results_path
+        self._all_results: list[dict[str, Any]] = []
+        self._log = log
+    def on_epoch_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        model: PreTrainedModel,
+        **kwargs: Any,
+    ) -> TrainerControl:
+        self._log.info(
+            "training.epoch_end",
+            epoch=state.epoch,
+            step=state.global_step,
+            loss=state.log_history[-1].get("loss") if state.log_history else None,
+        )
+        results = self._runner.run(
+            model=model,
+            eval_dataset=self._eval_dataset,
+            epoch=state.epoch or 0.0,
+            step=state.global_step,
+        )
+        self._all_results.append(results.to_dict())
+        self._persist_results()
+        return control
+    def on_log(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        logs: dict[str, float],
+        **kwargs: Any,
+    ) -> TrainerControl:
+        self._log.info("training.log", step=state.global_step, **logs)
+        return control
+    def on_train_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs: Any,
+    ) -> TrainerControl:
+        self._log.info(
+            "training.completed",
+            total_steps=state.global_step,
+            total_flos=state.total_flos,
+        )
+        return control
+    def _persist_results(self) -> None:
+        self._results_path.write_text(
+            json.dumps(self._all_results, indent=2), encoding="utf-8"
+        )
+# ---------------------------------------------------------------------------
+# Pipeline
+# ---------------------------------------------------------------------------
+class FineTuningPipeline:
+    """
+    Orchestrates the full fine-tuning lifecycle.
+    Usage:
+        config = TrainingJobConfig.model_validate(raw_dict)
+        pipeline = FineTuningPipeline(config)
+        pipeline.run()
+    """
+    def __init__(self, config: TrainingJobConfig) -> None:
+        self._config = config
+        self._log = log.bind(job=config.job_name, model=config.base_model)
+        self._output_dir = Path(config.checkpoint.output_dir) / config.job_name
+    # ------------------------------------------------------------------
+    # Public entry point
+    # ------------------------------------------------------------------
+    def run(self) -> Path:
+        """Execute all pipeline stages. Returns the final checkpoint path."""
+        self._log.info("pipeline.started")
+        self._preflight()
+        datasets = self._prepare_datasets()
+        model, tokenizer = self._load_model()
+        self._train(model, tokenizer, datasets)
+        final_path = self._export(model, tokenizer)
+        self._log.info("pipeline.finished", output=str(final_path))
+        return final_path
+    # ------------------------------------------------------------------
+    # Stage 1: Preflight
+    # ------------------------------------------------------------------
+    def _preflight(self) -> None:
+        self._log.info("pipeline.preflight")
+        # Validate config (already done at submission, but be defensive)
+        self._config.model_validate(self._config.model_dump())
+        # GPU check
+        if torch.cuda.is_available():
+            device_name = torch.cuda.get_device_name(0)
+            vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+            self._log.info("preflight.gpu", device=device_name, vram_gb=round(vram_gb, 1))
+        else:
+            self._log.warning("preflight.no_gpu", message="Training on CPU — will be slow")
+        # Disk space (rough check — 20 GB minimum)
+        free_gb = shutil.disk_usage(self._output_dir.parent).free / 1e9
+        if free_gb < 20:
+            self._log.warning("preflight.disk_low", free_gb=round(free_gb, 1))
+        # HF token if pushing
+        if self._config.hub.push_to_hub and not os.environ.get("HF_TOKEN"):
+            raise EnvironmentError("HF_TOKEN is required when hub.push_to_hub=true")
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+        self._log.info("preflight.passed")
+    # ------------------------------------------------------------------
+    # Stage 2: Dataset preparation
+    # ------------------------------------------------------------------
+    def _prepare_datasets(self) -> DatasetDict:
+        self._log.info("pipeline.dataset_prep")
+        ds_cfg = self._config.dataset
+        # Load — support both HF Hub paths and internal dataset IDs
+        raw: Dataset
+        if ds_cfg.dataset_id.startswith("ds_"):
+            # Internal dataset — load from local store
+            raw = Dataset.load_from_disk(f"./data/{ds_cfg.dataset_id}")
+        else:
+            raw = load_dataset(ds_cfg.dataset_id, split="train")  # type: ignore[assignment]
+        if ds_cfg.max_samples:
+            raw = raw.select(range(min(ds_cfg.max_samples, len(raw))))
+        if ds_cfg.shuffle:
+            raw = raw.shuffle(seed=ds_cfg.shuffle_seed)
+        n_train = int(len(raw) * ds_cfg.split_ratio)
+        splits = DatasetDict(
+            {
+                "train": raw.select(range(n_train)),
+                "eval": raw.select(range(n_train, len(raw))),
+            }
+        )
+        self._log.info(
+            "dataset.prepared",
+            train_size=len(splits["train"]),
+            eval_size=len(splits["eval"]),
+            column=ds_cfg.text_column,
+        )
+        return splits
+    # ------------------------------------------------------------------
+    # Stage 3: Model initialisation
+    # ------------------------------------------------------------------
+    def _load_model(self) -> tuple[PreTrainedModel, PreTrainedTokenizerBase]:
+        self._log.info("pipeline.model_load")
+        hp = self._config.training
+        dtype_map = {
+            "fp32": torch.float32,
+            "fp16": torch.float16,
+            "bf16": torch.bfloat16,
+        }
+        torch_dtype = dtype_map.get(hp.precision.value, torch.bfloat16)
+        tokenizer = AutoTokenizer.from_pretrained(self._config.base_model)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        model = AutoModelForCausalLM.from_pretrained(
+            self._config.base_model,
+            torch_dtype=torch_dtype,
+            device_map="auto" if torch.cuda.is_available() else "cpu",
+        )
+        if self._config.lora and self._config.lora.enabled:
+            lora_cfg = self._config.lora
+            peft_config = LoraConfig(
+                task_type=TaskType.CAUSAL_LM,
+                r=lora_cfg.r,
+                lora_alpha=lora_cfg.alpha,
+                lora_dropout=lora_cfg.dropout,
+                target_modules=lora_cfg.target_modules,
+                bias=lora_cfg.bias,  # type: ignore[arg-type]
+            )
+            model = get_peft_model(model, peft_config)
+            trainable, total = model.get_nb_trainable_parameters()
+            self._log.info(
+                "model.lora_applied",
+                trainable_params=trainable,
+                total_params=total,
+                trainable_pct=round(100 * trainable / total, 2),
+            )
+        else:
+            self._log.info("model.full_finetune")
+        return model, tokenizer  # type: ignore[return-value]
+    # ------------------------------------------------------------------
+    # Stage 4: Training
+    # ------------------------------------------------------------------
+    def _train(
+        self,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
+        datasets: DatasetDict,
+    ) -> None:
+        self._log.info("pipeline.training_start")
+        hp = self._config.training
+        ckpt = self._config.checkpoint
+        eval_cfg = self._config.evaluation
+        def tokenize(examples: dict[str, list[str]]) -> dict[str, Any]:
+            return tokenizer(
+                examples[self._config.dataset.text_column],
+                truncation=True,
+                max_length=hp.max_seq_length,
+                padding=False,
+            )
+        tokenized = datasets.map(tokenize, batched=True, remove_columns=datasets["train"].column_names)
+        training_args = TrainingArguments(
+            output_dir=str(self._output_dir),
+            num_train_epochs=hp.num_epochs,
+            per_device_train_batch_size=hp.batch_size,
+            per_device_eval_batch_size=hp.batch_size,
+            gradient_accumulation_steps=hp.gradient_accumulation_steps,
+            learning_rate=hp.learning_rate,
+            weight_decay=hp.weight_decay,
+            warmup_ratio=hp.warmup_ratio,
+            max_grad_norm=hp.max_grad_norm,
+            optim=hp.optimizer.value,
+            lr_scheduler_type=hp.lr_scheduler,
+            fp16=hp.precision.value == "fp16",
+            bf16=hp.precision.value == "bf16",
+            evaluation_strategy=eval_cfg.strategy.value,
+            eval_steps=eval_cfg.eval_steps,
+            save_strategy=ckpt.save_strategy.value,
+            save_steps=ckpt.save_steps,
+            save_total_limit=ckpt.save_total_limit,
+            load_best_model_at_end=eval_cfg.load_best_model_at_end,
+            metric_for_best_model=eval_cfg.metric_for_best_model.value,
+            greater_is_better=eval_cfg.greater_is_better,
+            seed=hp.seed,
+            dataloader_num_workers=hp.dataloader_num_workers,
+            report_to="none",  # structlog handles all logging
+            logging_steps=10,
+            resume_from_checkpoint=ckpt.resume_from_checkpoint,
+            push_to_hub=False,  # push handled separately in export stage
+        )
+        hook_runner = EvalHookRunner(self._config, tokenizer)
+        results_path = self._output_dir / "eval_results.json"
+        callback = CodeCraftLabCallback(
+            hook_runner=hook_runner,
+            eval_dataset=datasets["eval"],
+            results_path=results_path,
+        )
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized["train"],
+            eval_dataset=tokenized["eval"],
+            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
+            callbacks=[callback],
+        )
+        trainer.train(resume_from_checkpoint=ckpt.resume_from_checkpoint)
+    # ------------------------------------------------------------------
+    # Stage 5: Export + Hub push
+    # ------------------------------------------------------------------
+    def _export(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase) -> Path:
+        self._log.info("pipeline.export")
+        final_path = self._output_dir / "final"
+        model.save_pretrained(str(final_path))
+        tokenizer.save_pretrained(str(final_path))
+        self._log.info("model.saved", path=str(final_path))
+        hub_cfg = self._config.hub
+        if hub_cfg.push_to_hub and hub_cfg.repo_id:
+            self._log.info("hub.pushing", repo_id=hub_cfg.repo_id)
+            model.push_to_hub(hub_cfg.repo_id, private=hub_cfg.private)
+            tokenizer.push_to_hub(hub_cfg.repo_id, private=hub_cfg.private)
+            self._log.info("hub.pushed", repo_id=hub_cfg.repo_id)
+        return final_path