File size: 47,281 Bytes

"""
train.py
--------
Main training entry point.
Implements 2-stage training following RaDialog:
  Stage 1 — Train projection layer only (2 epochs, LR=1e-3)
  Stage 2 — Train projection + LLM LoRA (10 epochs, LR=2e-4)

Supports two datasets, selected via `train_cfg.data.dataset_name`:
  - "MIMIC-CXR"  → all 3 tasks (findings, impression, VQA)
  - "IU-Xray"    → findings + impression only (no VQA)

Checkpoints and results are written under:
    {training.output_root}/{dataset_name}_run_{N}/stageX_*

Usage:
    python -m training.train --train_config configs/train_config.yaml
"""

import os
import sys
from pathlib import Path

# Silence HF per-shard download tqdm spam — MUST be before transformers/peft/hf_hub imports
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
import utils._quiet  # noqa: F401

import argparse
import torch
from omegaconf import OmegaConf

# Free perf win on A100/H100 (no-op on T4): allow TF32 for fp32 matmul / cuDNN.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32       = True

import transformers
from transformers import TrainingArguments, Trainer, TrainerCallback, PrinterCallback
from transformers.trainer_callback import ProgressCallback


class _NoEvalTqdmCallback(ProgressCallback):
    """HF ``ProgressCallback`` variant whose per-batch evaluation bar is off.

    Background (author's notes): when training is launched from Colab as a
    `!python -m ...` subprocess, HF Trainer does not detect a notebook and
    falls back to plain tqdm. Colab's text renderer then turns the fast
    carriage-return refreshes of the eval bar into one new output line per
    step, which lags the browser tab. The training bar refreshes slowly
    enough to stay readable, so only the prediction-step hook is silenced.
    eval_loss is still reported through the normal logging path at the end
    of each evaluation pass.
    """

    def on_prediction_step(self, args, state, control, **kwargs):  # noqa: D401
        # Deliberate no-op: skipping the parent hook means the eval bar is
        # never created or advanced.
        return None

# Add project root to path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from model import CXRVisionLanguageModel
from model.rad_dino import BioViLTEncoder
from data import CXRInstructDataset, CXRDataCollator, ITCDataCollator
from utils.logger import setup_logger
from utils.checkpoint import save_checkpoint, load_checkpoint
from utils.hf_uploader import build_tracker_from_cfg, pull_last_for_resume, hydrate_run_dir_from_hf
from utils.dataset_resolver import (
    resolve_dataset_spec,
    resolve_run_id,
    save_run_config,
    run_dir,
    stage_dir,
    DatasetSpec,
)


def parse_args():
    """Parse the command-line flags of the training entry point.

    Returns:
        argparse.Namespace with the attributes ``model_config``,
        ``train_config``, ``stage``, ``mode``, ``resume_from``, ``run_id``
        and ``resume_from_hf``.
    """
    # (flag, add_argument keyword options) — registered in this order.
    flag_specs = [
        ("--model_config", dict(
            type=str,
            default="configs/model_config.yaml",
            help="Path to model config",
        )),
        ("--train_config", dict(
            type=str,
            default="configs/train_config.yaml",
            help="Path to training config",
        )),
        ("--stage", dict(
            type=int,
            default=None,
            help="Run only stage 1 or stage 2 (default: run both). With --mode resume, "
                 "the stage is auto-detected and this flag should be left unset.",
        )),
        ("--mode", dict(
            type=str,
            default=None,
            choices=["fresh", "resume"],
            help="Unified resume controller. 'fresh' → new run_N folder. "
                 "'resume' → reuse latest matching run_id (or --run_id), auto-detect "
                 "which stage to continue from based on checkpoints on disk. "
                 "If unset, behaviour is inferred from --resume_from / --run_id (legacy).",
        )),
        ("--resume_from", dict(
            type=str,
            default=None,
            help="Path to checkpoint to resume from (legacy; prefer --mode resume)",
        )),
        ("--run_id", dict(
            type=str,
            default=None,
            help="Explicit run id (e.g. 'IU-Xray_run_3'). If unset, auto-resolve.",
        )),
        ("--resume_from_hf", dict(
            action="store_true",
            help="Pull <run_id>/<stage>/last/ from the HF Hub and resume from it. "
                 "Use when training on a fresh VM after the previous one was killed.",
        )),
    ]

    cli = argparse.ArgumentParser(description="Train CXR VLM")
    for flag, options in flag_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()


# ─── Resume-point auto-detection ────────────────────────────────────────────

def _list_checkpoints(stage_dir):
    """Return [Path, …] of `checkpoint-NNN` folders sorted ascending by step."""
    if not stage_dir.is_dir():
        return []
    out = []
    for p in stage_dir.iterdir():
        if not p.is_dir() or not p.name.startswith("checkpoint-"):
            continue
        suffix = p.name.split("-", 1)[1]
        if suffix.isdigit():
            out.append((int(suffix), p))
    return [p for _, p in sorted(out)]


def detect_resume_point(run_dir_path, stage1_subdir, stage2_subdir):
    """
    Look at what a run has already written to disk and pick the resume point.

    Returns a tuple `(target_stage, ckpt_path)`:
        target_stage  : "stage1" | "stage2" | "done"
        ckpt_path     : checkpoint folder (Path) to hand to HF Trainer, or
                        None when the stage should start from scratch.

    Checks, first match wins:
      1. stage 2 final file exists   → ("done", None)
      2. stage 2 has checkpoints     → ("stage2", newest checkpoint)
      3. stage 1 final file exists   → ("stage2", None)
      4. stage 1 has checkpoints     → ("stage1", newest checkpoint)
      5. nothing found               → ("stage1", None)
    """
    from pathlib import Path as _P

    root = _P(run_dir_path)

    # One row per stage, later stage first:
    # (stage folder, final-artifact file, result if final exists, result if only ckpts exist)
    probes = (
        (root / stage2_subdir, "stage2_final_projection.pt", "done", "stage2"),
        (root / stage1_subdir, "stage1_final_projection.pt", "stage2", "stage1"),
    )
    for folder, final_name, when_final, when_partial in probes:
        if (folder / final_name).exists():
            return (when_final, None)
        found = _list_checkpoints(folder)
        if found:
            # _list_checkpoints sorts ascending, so the last entry is the newest.
            return (when_partial, found[-1])

    return ("stage1", None)


def compute_training_plan(train_cfg, instruct_json_path):
    """
    Estimate the optimizer-step budget of stage 1 + stage 2 for the startup
    banner.

    The estimate combines the epoch counts / batch settings in `train_cfg`
    with the number of records in the instruct JSON whose "split" is
    "train". If the JSON cannot be read or parsed, the sample count is
    treated as 0 (best-effort; no exception is raised for that).

    Returns a dict of ints plus the boolean "itc_stage1":
        train_samples, effective_batch, stage1_eff_batch, stage2_eff_batch,
        steps_per_epoch, stage1_steps, stage2_steps, total_steps,
        stage1_epochs, stage2_epochs, itc_stage1.
    """
    import json as _json

    tr = train_cfg.training

    try:
        with open(instruct_json_path, "r", encoding="utf-8") as fh:
            records = _json.load(fh)
        train_count = len([r for r in records if r.get("split") == "train"])
    except Exception:
        # Deliberately tolerant: the plan is informational only.
        train_count = 0

    def _eff_batch(stage_cfg, itc_override=None):
        # Precedence: ITC override (stage1.itc.*) > per-stage value > global.
        def resolve(key):
            inherited = _cfg(stage_cfg, tr, key, 1)
            if itc_override is None:
                return int(inherited)
            return int(itc_override.get(key, inherited))

        per_device = resolve("per_device_train_batch_size")
        accum      = resolve("gradient_accumulation_steps")
        return max(1, per_device * accum)

    def _steps_per_epoch(eff_batch):
        # Ceiling division, floored at one step per epoch.
        return max(1, -(-train_count // eff_batch))

    s1_cfg = train_cfg.stage1
    s2_cfg = train_cfg.stage2

    # A disabled stage contributes zero epochs (and therefore zero steps).
    s1_enabled = bool(getattr(s1_cfg, "enabled", True))
    s2_enabled = bool(getattr(s2_cfg, "enabled", True))
    s1_epochs = 0
    if s1_enabled:
        s1_epochs = int(getattr(s1_cfg, "num_epochs", 0))
    s2_epochs = 0
    if s2_enabled:
        s2_epochs = int(getattr(s2_cfg, "num_epochs", 0))

    itc_cfg = s1_cfg.get("itc", None)
    itc_on = bool(itc_cfg and itc_cfg.get("enabled", False))

    s1_eff = _eff_batch(s1_cfg, itc_cfg) if itc_on else _eff_batch(s1_cfg, None)
    s2_eff = _eff_batch(s2_cfg)

    s2_spe = _steps_per_epoch(s2_eff)
    s1_steps = _steps_per_epoch(s1_eff) * s1_epochs
    s2_steps = s2_spe * s2_epochs

    plan = {}
    plan["train_samples"]    = train_count
    plan["effective_batch"]  = s2_eff      # representative (Stage-2 / LM path)
    plan["stage1_eff_batch"] = s1_eff
    plan["stage2_eff_batch"] = s2_eff
    plan["steps_per_epoch"]  = s2_spe
    plan["stage1_steps"]     = s1_steps
    plan["stage2_steps"]     = s2_steps
    plan["total_steps"]      = s1_steps + s2_steps
    plan["stage1_epochs"]    = s1_epochs
    plan["stage2_epochs"]    = s2_epochs
    plan["itc_stage1"]       = itc_on
    return plan


def _fmt_plan_banner(plan, run_id, target_stage, resume_ckpt):
    s1, s2, tot = plan["stage1_steps"], plan["stage2_steps"], plan["total_steps"]
    head = f"TRAINING PLAN — {run_id}"
    sep  = "=" * max(len(head) + 4, 60)
    cur  = ""

    # Prefer real global_step from trainer_state.json over parsing the folder
    # name — hydrate_run_dir_from_hf uses "checkpoint-1" as a placeholder
    # regardless of actual step, so the folder digit is meaningless.
    def _ckpt_step(ckpt):
        if not ckpt:
            return None
        try:
            import json as _json
            from pathlib import Path as _P
            ts = _P(str(ckpt)) / "trainer_state.json"
            if ts.is_file():
                return int(_json.load(open(ts))["global_step"])
        except Exception:
            pass
        # Fallback: parse folder suffix
        suf = str(ckpt).split("-")[-1]
        return int(suf) if suf.isdigit() else None

    real_step = _ckpt_step(resume_ckpt)
    if target_stage == "stage1":
        offset = real_step if real_step is not None else 0
        cur = f"Resuming at step {offset} / {tot}  (inside stage 1)"
    elif target_stage == "stage2":
        offset = (s1 + real_step) if real_step is not None else s1
        cur = f"Resuming at step {offset} / {tot}  (inside stage 2)"
    elif target_stage == "done":
        cur = f"All {tot} steps already complete — nothing to do"

    s1_eff = plan.get("stage1_eff_batch", plan["effective_batch"])
    s2_eff = plan.get("stage2_eff_batch", plan["effective_batch"])
    itc_tag = "  [ITC]" if plan.get("itc_stage1") else ""
    lines = [
        sep, f"  {head}", sep,
        f"  Train samples       : {plan['train_samples']:,}",
        f"  Stage 1{itc_tag:<6}      : {plan['stage1_epochs']} epochs → {s1} steps  "
        f"(eff.batch {s1_eff}; global steps 1–{s1})",
        f"  Stage 2             : {plan['stage2_epochs']} epochs → {s2} steps  "
        f"(eff.batch {s2_eff}; global steps {s1+1}–{tot})",
        f"  TOTAL               : {tot} optimizer steps",
    ]
    if cur:
        lines += ["  " + "─" * (len(sep) - 4), f"  {cur}"]
    lines.append(sep)
    return "\n".join(lines)


def get_trainer(
    model,
    train_dataset,
    val_dataset,
    collator,
    training_args: TrainingArguments,
    itc_mode: bool = False,
    itc_temperature: float = 0.07,
) -> Trainer:
    """Assemble the HuggingFace Trainer used by both training stages.

    Args:
        model: the model to train; called with images / input_ids /
            attention_mask / labels in LM mode, and must expose
            ``forward_itc`` when `itc_mode` is on.
        train_dataset, val_dataset: training and evaluation datasets.
        collator: batch collator handed to the Trainer.
        training_args: fully built TrainingArguments.
        itc_mode: when True the loss is a symmetric image-text InfoNCE
            (Stage-1 contrastive alignment) instead of the causal-LM loss.
        itc_temperature: InfoNCE temperature (clamped below at 1e-4).

    Returns:
        A configured Trainer subclass instance with PrinterCallback removed
        and, if the env var DISABLE_EVAL_TQDM is "1", the eval progress bar
        suppressed.
    """

    class CXRTrainer(Trainer):
        """Trainer subclass that feeds images to the model and replaces the
        stock save / load with the project's `save_checkpoint` /
        `load_checkpoint` helpers (per the author's notes: only the
        trainable artifacts are written, not the full LLM state dict)."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Contrastive mode has its own loss path.
            if itc_mode:
                return self._itc_loss(model, inputs, return_outputs)

            fwd = model(
                images=inputs["images"],
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                labels=inputs["labels"],
            )
            lm_loss = fwd["loss"]
            if return_outputs:
                return (lm_loss, fwd)
            return lm_loss

        def _itc_loss(self, model, inputs, return_outputs=False):
            """Symmetric image-text InfoNCE over the batch.

            Image embeddings come from ``forward_itc``; text embeddings are
            read from ``inputs["text_embeds"]`` and L2-normalised here.
            Row i / column i of the (B, B) similarity matrix is treated as
            the positive pair."""
            import torch.nn.functional as F

            # Unwrap a DataParallel / DDP-style wrapper if present.
            core = getattr(model, "module", model)
            image_vecs = core.forward_itc(inputs["images"])
            text_vecs = F.normalize(
                inputs["text_embeds"].to(image_vecs.dtype), dim=-1
            )
            scale = 1.0 / max(itc_temperature, 1e-4)
            sim = scale * image_vecs @ text_vecs.t()
            targets = torch.arange(sim.size(0), device=sim.device)
            image_to_text = F.cross_entropy(sim, targets)
            text_to_image = F.cross_entropy(sim.t(), targets)
            loss = 0.5 * (image_to_text + text_to_image)
            if return_outputs:
                return (loss, {"loss": loss, "logits": sim})
            return loss

        def prediction_step(self, model, inputs, prediction_loss_only,
                            ignore_keys=None):
            if not itc_mode:
                return super().prediction_step(model, inputs, prediction_loss_only,
                                               ignore_keys=ignore_keys)
            # ITC batches carry no token "labels": report the loss only so
            # that metric_for_best_model="eval_loss" still has a value.
            with torch.no_grad():
                loss = self.compute_loss(model, inputs)
            return (loss.detach(), None, None)

        def _save(self, output_dir=None, state_dict=None):
            target = self.args.output_dir if output_dir is None else output_dir
            Path(target).mkdir(parents=True, exist_ok=True)
            # Project checkpoint writer, artifacts named "checkpoint*".
            save_checkpoint(self.model, target, name="checkpoint")
            # Keep the TrainingArguments next to the weights (used on resume).
            torch.save(self.args, Path(target) / "training_args.bin")

        def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
            # Skip the upstream weight-file existence checks; this trainer
            # never writes those files, so delegate to the project loader.
            load_checkpoint(self.model if model is None else model,
                            resume_from_checkpoint)

        def _get_train_sampler(self, *args, **kwargs):
            """
            Return a `WeightedRandomSampler` when the train dataset offers
            `get_per_sample_weights()` and it yields non-None weights;
            otherwise defer to the parent Trainer's sampler.

            The weighted sampler draws `len(dataset)` indices with
            replacement, which is what lets an up-weighted but small task
            be sampled more often than it has samples. Only the training
            sampler is overridden here.
            """
            dataset = self.train_dataset
            weight_fn = getattr(dataset, "get_per_sample_weights", None)
            sample_weights = weight_fn() if weight_fn is not None else None
            if sample_weights is None:
                return super()._get_train_sampler(*args, **kwargs)

            from torch.utils.data import WeightedRandomSampler
            return WeightedRandomSampler(
                weights=sample_weights,
                num_samples=len(dataset),
                replacement=True,
            )

    trainer = CXRTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=collator,
    )
    trainer.remove_callback(PrinterCallback)

    # Eval progress bar policy: keep the per-batch eval bar by default so a
    # long evaluation is distinguishable from a hang. Exporting
    # DISABLE_EVAL_TQDM=1 swaps in the callback that silences it.
    suppress_eval_bar = os.environ.get("DISABLE_EVAL_TQDM", "0") == "1"
    if suppress_eval_bar:
        trainer.remove_callback(ProgressCallback)
        trainer.add_callback(_NoEvalTqdmCallback())
    return trainer


def _cfg(stage_cfg, tr, key, default=None):
    """
    Resolve a training hyperparameter with per-stage override semantics:
    use the value from `stage_cfg` (stage1/stage2) if present, else fall back
    to the global `training:` block, else `default`. This lets stages share
    machine-level settings (fp16, dataloader_*) while overriding stage-specific
    ones (batch size, warmup, optim) — see train_config.yaml docs.
    """
    if stage_cfg is not None:
        v = stage_cfg.get(key, None)
        if v is not None:
            return v
    return tr.get(key, default)


def _build_training_args(train_cfg, stage_cfg, out_dir, run_name, *, enable_best,
                         overrides=None):
    """
    Translate the YAML config into a `TrainingArguments` instance.

    Resolution rules:
      * save / eval cadence, checkpoint limit, precision flags, logging and
        dataloader settings are read from `train_cfg.training`;
      * epochs and learning rate come straight from `stage_cfg`;
      * batch sizes, gradient accumulation, scheduler, warmup, weight decay
        and optimizer go through `_cfg` (stage value, else global value);
      * any key present in `overrides` beats both of the above.

    `enable_best` adds load_best_model_at_end / metric_for_best_model /
    greater_is_better to the arguments.
    """
    tr = train_cfg.training
    forced = overrides or {}

    def pick(key, default=None):
        return forced[key] if key in forced else _cfg(stage_cfg, tr, key, default)

    # Eval cadence mirrors the save cadence unless configured separately.
    save_strategy = getattr(tr, "save_strategy", "steps")
    eval_strategy = getattr(tr, "evaluation_strategy", save_strategy)
    save_steps    = getattr(tr, "save_steps", 200)
    eval_steps    = getattr(tr, "eval_steps", save_steps)

    # Persistent workers are only requested when there are workers at all.
    num_workers = getattr(tr, "dataloader_num_workers", 4)
    persistent_workers = (
        getattr(tr, "dataloader_persistent_workers", True) and num_workers > 0
    )

    hf_kwargs = {
        "output_dir":                    out_dir,
        "num_train_epochs":              stage_cfg.num_epochs,
        "per_device_train_batch_size":   pick("per_device_train_batch_size", 1),
        "per_device_eval_batch_size":    pick("per_device_eval_batch_size", 1),
        "gradient_accumulation_steps":   pick("gradient_accumulation_steps", 1),
        "learning_rate":                 stage_cfg.learning_rate,
        "lr_scheduler_type":             pick("lr_scheduler_type", "cosine"),
        "warmup_ratio":                  pick("warmup_ratio", 0.0),
        "weight_decay":                  pick("weight_decay", 0.0),
        "fp16":                          tr.fp16,
        "bf16":                          getattr(tr, "bf16", False),
        "save_strategy":                 save_strategy,
        "eval_strategy":                 eval_strategy,
        "logging_steps":                 tr.logging_steps,
        "save_total_limit":              getattr(tr, "save_total_limit", 1),
        "report_to":                     "wandb" if train_cfg.wandb.enabled else "none",
        "run_name":                      run_name,
        "dataloader_num_workers":        num_workers,
        "dataloader_pin_memory":         getattr(tr, "dataloader_pin_memory", True),
        "dataloader_persistent_workers": persistent_workers,
        # Optimizer is configurable (e.g. `paged_adamw_8bit`); the default
        # stays on plain `adamw_torch` for backward compatibility.
        "optim":                         pick("optim", "adamw_torch"),
        "remove_unused_columns":         False,
    }

    # Step counts are only passed for step-based strategies.
    if save_strategy == "steps":
        hf_kwargs["save_steps"] = save_steps
    if eval_strategy == "steps":
        hf_kwargs["eval_steps"] = eval_steps

    if enable_best:
        hf_kwargs.update(
            load_best_model_at_end=getattr(tr, "load_best_model_at_end", True),
            metric_for_best_model=getattr(tr, "metric_for_best_model", "eval_loss"),
            greater_is_better=getattr(tr, "greater_is_better", False),
        )
    return TrainingArguments(**hf_kwargs)


class HFBestLastCallback(TrainerCallback):
    """
    Maintain exactly two checkpoint folders + a training log on HF Hub for
    each stage:

        <run_id>/<stage>/last/         ← latest checkpoint (for resume)
        <run_id>/<stage>/best/         ← lowest eval_loss so far (for inference)
        <run_id>/<stage>/best/best_meta.json
        <run_id>/<stage>/training_log.jsonl

    `last/` is overwritten on every save. `best/` is overwritten only when
    `metric_for_best` improves. No checkpoint-<step>/ folders accumulate.

    Each upload first deletes the remote folder so orphan files (e.g. a
    `optimizer.pt` from a previous step that no longer exists locally) are
    purged.

    On resume, the best-so-far value is seeded from the Trainer's restored
    state (see `on_train_begin`), so the first evaluation after a resume
    does not automatically count as a new best and overwrite `best/`.

    Args:
        tracker: uploader object providing `delete_remote`, `upload_folder`,
            `upload_json` and `upload_jsonl`; None disables all uploads.
        stage_subdir: remote sub-folder for this stage (e.g. "stage1").
        logger: logger used to report failed uploads as warnings.
        metric_for_best: key looked up in the evaluation metrics dict.
        greater_is_better: direction of improvement for that metric.
    """

    def __init__(self, tracker, stage_subdir: str, logger,
                 metric_for_best: str = "eval_loss",
                 greater_is_better: bool = False):
        self.tracker           = tracker
        self.stage_subdir      = stage_subdir
        self.logger            = logger
        self.metric_for_best   = metric_for_best
        self.greater_is_better = greater_is_better
        self.best_metric       = None   # best value seen so far (None = none yet)
        self.best_step         = None   # global_step at which it was observed

    def _is_better(self, m: float) -> bool:
        """Return True when `m` beats the recorded best (or none is recorded)."""
        if self.best_metric is None:
            return True
        return (m > self.best_metric) if self.greater_is_better else (m < self.best_metric)

    def on_train_begin(self, args, state, control, **kw):
        """Seed `best_metric` from the Trainer state when resuming.

        Without this, `best_metric` starts at None on every process start,
        so the first evaluation after a resume would be treated as the new
        best regardless of earlier results. The seed is only taken when the
        Trainer tracks the same metric as this callback; HF compares under
        an "eval_"-prefixed name, so the same normalisation is applied.
        `state.best_metric` is presumably restored from the checkpoint's
        trainer_state.json on resume and is None on a fresh run.
        """
        if self.best_metric is not None:
            return
        hf_metric = getattr(args, "metric_for_best_model", None)
        if not hf_metric:
            return
        if not hf_metric.startswith("eval_"):
            hf_metric = f"eval_{hf_metric}"
        if hf_metric != self.metric_for_best:
            return
        prior = getattr(state, "best_metric", None)
        if prior is None:
            return
        try:
            self.best_metric = float(prior)
        except (TypeError, ValueError):
            # Unusable stored value: keep the fresh-start behaviour.
            self.best_metric = None

    def on_evaluate(self, args, state, control, metrics=None, **kw):
        """Record the metric and step whenever an evaluation improves on the best."""
        if self.tracker is None or not metrics:
            return
        m = metrics.get(self.metric_for_best)
        if m is None:
            return
        if self._is_better(m):
            self.best_metric = float(m)
            self.best_step   = state.global_step

    def on_save(self, args, state, control, **kw):
        """Mirror the checkpoint just written to `last/` (and `best/` if it is the best).

        Every upload is best-effort: a failure is logged as a warning and
        training continues.
        """
        if self.tracker is None:
            return
        ckpt_dir = Path(args.output_dir) / f"checkpoint-{state.global_step}"
        if not ckpt_dir.exists():
            return

        # ── last/ — every save ────────────────────────────────────────
        try:
            self.tracker.delete_remote(f"{self.stage_subdir}/last")
            self.tracker.upload_folder(str(ckpt_dir), f"{self.stage_subdir}/last")
        except Exception as e:
            self.logger.warning(f"[HF upload] last @ step {state.global_step} failed: {e}")

        # ── best/ — only if this step is the new best ────────────────
        # best_step is set by on_evaluate, so this only fires when an
        # improving evaluation was recorded at this same global step.
        if self.best_step == state.global_step:
            try:
                self.tracker.delete_remote(f"{self.stage_subdir}/best")
                self.tracker.upload_folder(str(ckpt_dir), f"{self.stage_subdir}/best")
                self.tracker.upload_json(
                    {
                        "step":               state.global_step,
                        "epoch":              state.epoch,
                        self.metric_for_best: self.best_metric,
                    },
                    f"{self.stage_subdir}/best/best_meta.json",
                )
            except Exception as e:
                self.logger.warning(f"[HF upload] best @ step {state.global_step} failed: {e}")

        # ── training log — full log_history each save ────────────────
        try:
            self.tracker.upload_jsonl(
                state.log_history,
                f"{self.stage_subdir}/training_log.jsonl",
            )
        except Exception as e:
            self.logger.warning(f"[HF upload] training_log failed: {e}")


def _build_datasets(spec: DatasetSpec, train_cfg, model, transform_train, transform_val,
                    itc_mode: bool = False, itc_text_cache=None):
    """Build the (train, val) pair of CXRInstructDataset objects for a stage.

    Both datasets share every setting taken from `spec` / `train_cfg` and
    differ only in the image transform and the split name
    (`train_cfg.data.train_split` vs `train_cfg.data.val_split`).

    With `itc_mode=True` the datasets are asked to yield (image, precomputed
    text embedding) pairs for Stage-1 contrastive alignment rather than
    tokenized prompt/target samples; `itc_text_cache` is forwarded as-is.

    Returns:
        (train_ds, val_ds)
    """
    # An empty / missing setting is normalised to None.
    cache_dir = getattr(train_cfg.data, "feature_cache_dir", None) or None
    if cache_dir:
        print(f"[_build_datasets] feature_cache_dir = {cache_dir}  "
              f"(encoder bypass on cache hit)")

    shared = {
        "data_path":         spec.instruct_json,
        "image_root":        spec.image_root,
        "tokenizer":         model.tokenizer,
        "task":              "mixed",
        "cutoff_len":        train_cfg.training.cutoff_len,
        "task_weights":      spec.task_weights,
        "max_images":        spec.max_images,
        "feature_cache_dir": cache_dir,
        "itc_mode":          itc_mode,
        "itc_text_cache":    itc_text_cache,
    }

    train_ds = CXRInstructDataset(
        transform=transform_train, split=train_cfg.data.train_split, **shared
    )
    val_ds = CXRInstructDataset(
        transform=transform_val, split=train_cfg.data.val_split, **shared
    )
    return train_ds, val_ds


def _stage1_itc_cfg(train_cfg):
    """Return the stage1.itc DictConfig if ITC is enabled, else None."""
    itc = train_cfg.stage1.get("itc", None)
    if itc and itc.get("enabled", False):
        return itc
    return None


def run_stage1(model, train_cfg, model_cfg, spec, out_dir, logger, tracker=None, resume_from=None):
    """
    Stage 1: align the projection layer.

    Two variants, chosen by `stage1.itc.enabled` in the train config:
      • off (default) → causal-LM loss; the model is put in stage-1 mode.
      • on            → InfoNCE against precomputed text embeddings read
                        from `stage1.itc.text_embed_cache`; the model is
                        put in stage-1 ITC mode.

    After training, the model is saved under `out_dir` as "stage1_final";
    when a `tracker` is given, those artifacts are uploaded to
    `stage1/best/` and the run metadata is updated.

    Raises:
        ValueError: ITC is enabled but no `text_embed_cache` is configured.

    Returns:
        The trained model (same object that was passed in).
    """
    itc = _stage1_itc_cfg(train_cfg)
    use_itc = bool(itc)

    banner = "=" * 60
    mode_tag = "ITC contrastive" if use_itc else "projection only (LM loss)"
    logger.info(banner)
    logger.info(f"STAGE 1: {mode_tag}  [{spec.dataset_name}]")
    logger.info(f"  output_dir = {out_dir}")
    logger.info(banner)

    # ── model mode + dataset options ─────────────────────────────────
    dataset_extras = {}
    if use_itc:
        model.set_stage1_itc_mode()
        text_cache = itc.get("text_embed_cache", None)
        if not text_cache:
            raise ValueError("stage1.itc.enabled=true requires stage1.itc.text_embed_cache")
        dataset_extras = {"itc_mode": True, "itc_text_cache": text_cache}
    else:
        model.set_stage1_mode()

    train_ds, val_ds = _build_datasets(
        spec, train_cfg, model,
        transform_train=BioViLTEncoder.get_transform("train"),
        transform_val=BioViLTEncoder.get_transform("val"),
        **dataset_extras,
    )

    # ── collator + batch overrides ───────────────────────────────────
    overrides = None
    if use_itc:
        collator = ITCDataCollator()
        # ITC-specific batch settings win; unset keys fall through to the
        # stage / global values.
        overrides = {}
        for batch_key in ("per_device_train_batch_size",
                          "per_device_eval_batch_size",
                          "gradient_accumulation_steps"):
            if itc.get(batch_key, None) is not None:
                overrides[batch_key] = itc.get(batch_key)
    else:
        collator = CXRDataCollator(pad_token_id=model.tokenizer.pad_token_id)

    model.print_trainable_params()

    training_args = _build_training_args(
        train_cfg, train_cfg.stage1, out_dir,
        run_name=f"{spec.dataset_name}-{train_cfg.wandb.run_name}-stage1",
        enable_best=True,
        overrides=overrides,
    )

    temperature = float(itc.get("temperature", 0.07)) if itc else 0.07
    trainer = get_trainer(
        model, train_ds, val_ds, collator, training_args,
        itc_mode=use_itc,
        itc_temperature=temperature,
    )

    # Optional mirroring of intermediate checkpoints to the HF Hub.
    wants_hf_mirror = getattr(train_cfg.training, "upload_intermediate_to_hf", False)
    if wants_hf_mirror and tracker is not None:
        tr = train_cfg.training
        trainer.add_callback(HFBestLastCallback(
            tracker,
            stage_subdir="stage1",
            logger=logger,
            metric_for_best=getattr(tr, "metric_for_best_model", "eval_loss"),
            greater_is_better=getattr(tr, "greater_is_better", False),
        ))

    if resume_from:
        logger.info(f"Resuming stage1 from checkpoint: {resume_from}")
        trainer.train(resume_from_checkpoint=resume_from)
    else:
        trainer.train()

    save_checkpoint(model, out_dir, "stage1_final")
    logger.info(f"Stage 1 complete. Checkpoint saved to {out_dir}")

    if tracker is None:
        return model

    # ── HF Hub upload: stage1 final → overwrite best/ ────────────────
    # Author's note: with load_best_model_at_end=True the in-memory model
    # after train() is the best one, and it was just written as
    # stage1_final_*. The files go to stage1/best/ under the "checkpoint_*"
    # names that load_checkpoint(name="checkpoint") expects.
    final_dir = Path(out_dir)
    tracker.delete_remote("stage1/best")
    tracker.upload_file(
        str(final_dir / "stage1_final_projection.pt"),
        "stage1/best/checkpoint_projection.pt",
    )

    # LoRA folder is uploaded only if it exists (absent in ITC mode).
    lora_dir = final_dir / "stage1_final_lora"
    if lora_dir.is_dir():
        tracker.upload_folder(str(lora_dir), "stage1/best/checkpoint_lora")

    # Remaining artifacts are optional single files.
    optional_files = (
        ("stage1_final_itc_head.pt",
         "stage1/best/checkpoint_itc_head.pt"),
        ("stage1_final_chexpert_classifier.pt",
         "stage1/best/checkpoint_chexpert_classifier.pt"),
    )
    for local_name, remote_name in optional_files:
        local_file = final_dir / local_name
        if local_file.exists():
            tracker.upload_file(str(local_file), remote_name)

    tracker.write_meta({
        "dataset_name":      spec.dataset_name,
        "stage1_done":       True,
        "stage1_mode":       "itc" if itc else "lm",
        "stage1_output_dir": out_dir,
        "stage1_epochs":     train_cfg.stage1.num_epochs,
        "stage1_lr":         train_cfg.stage1.learning_rate,
    })
    return model


def run_stage2(model, train_cfg, model_cfg, spec, out_dir, logger,
               resume_from=None, tracker=None):
    """
    Stage 2: Train projection + LLM LoRA (instruction tuning).
    Vision encoder frozen. LLM trained via LoRA adapters.

    Args:
        model:       model exposing `set_stage2_mode()`, `print_trainable_params()`
                     and a `tokenizer` with a `pad_token_id`.
        train_cfg:   training config; reads `stage2`, `training` and `wandb.run_name`.
        model_cfg:   model config (accepted for signature parity; not read here).
        spec:        resolved dataset spec; `dataset_name` is used for logging,
                     the run name and the Hub metadata.
        out_dir:     directory that receives the Trainer output and the
                     `stage2_final_*` artifacts.
        logger:      logger used for progress messages.
        resume_from: optional Trainer checkpoint path to resume from.
        tracker:     optional HF Hub tracker; when None nothing is uploaded.

    Returns:
        The same `model` object after training.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info(f"STAGE 2: Instruction tuning (projection + LoRA)  [{spec.dataset_name}]")
    logger.info(f"  output_dir = {out_dir}")
    logger.info(banner)

    model.set_stage2_mode()
    model.print_trainable_params()

    train_ds, val_ds = _build_datasets(
        spec, train_cfg, model,
        transform_train = BioViLTEncoder.get_transform("train"),
        transform_val   = BioViLTEncoder.get_transform("val"),
    )

    training_args = _build_training_args(
        train_cfg, train_cfg.stage2, out_dir,
        run_name    = f"{spec.dataset_name}-{train_cfg.wandb.run_name}-stage2",
        enable_best = True,
    )

    collator = CXRDataCollator(pad_token_id=model.tokenizer.pad_token_id)
    trainer  = get_trainer(model, train_ds, val_ds, collator, training_args)

    # Intermediate best/last uploads are opt-in and need a live tracker.
    if getattr(train_cfg.training, "upload_intermediate_to_hf", False) and tracker is not None:
        tr = train_cfg.training
        trainer.add_callback(HFBestLastCallback(
            tracker,
            stage_subdir      = "stage2",
            logger            = logger,
            metric_for_best   = getattr(tr, "metric_for_best_model", "eval_loss"),
            greater_is_better = getattr(tr, "greater_is_better", False),
        ))

    if resume_from:
        # Log the resume source, mirroring what stage 1 does, so the run log
        # shows which checkpoint the Trainer state was restored from.
        logger.info(f"Resuming stage2 from checkpoint: {resume_from}")
        trainer.train(resume_from_checkpoint=resume_from)
    else:
        trainer.train()

    save_checkpoint(model, out_dir, "stage2_final")
    logger.info(f"Stage 2 complete. Checkpoint saved to {out_dir}")

    # ── HF Hub upload: stage2 final → overwrite best/ ────────────────
    # The stage2_final_* files just written are re-published under the
    # `checkpoint_*` names inside stage2/best/.
    if tracker is not None:
        s2 = Path(out_dir)
        tracker.delete_remote("stage2/best")
        tracker.upload_file(
            str(s2 / "stage2_final_projection.pt"),
            "stage2/best/checkpoint_projection.pt",
        )
        tracker.upload_folder(
            str(s2 / "stage2_final_lora"),
            "stage2/best/checkpoint_lora",
        )
        # The CheXpert classifier artifact is optional; upload only if present.
        chexpert_pt = s2 / "stage2_final_chexpert_classifier.pt"
        if chexpert_pt.exists():
            tracker.upload_file(
                str(chexpert_pt),
                "stage2/best/checkpoint_chexpert_classifier.pt",
            )
        tracker.write_meta({
            "dataset_name":      spec.dataset_name,
            "stage2_done":       True,
            "stage2_output_dir": out_dir,
            "stage2_epochs":     train_cfg.stage2.num_epochs,
            "stage2_lr":         train_cfg.stage2.learning_rate,
        })
    return model


def main():
    """
    CLI entry point: resolve the run, then execute Stage 1 and/or Stage 2.

    Steps, in order:
      1. Parse CLI args, load the model/train configs and the dataset spec.
      2. Resolve the run_id and (optionally) pull / hydrate checkpoints
         from the HF Hub.
      3. With `--mode resume` and no explicit `--stage`, auto-detect the
         resume point; return early when both stages are already done.
      4. Snapshot the config, build the optional Hub tracker, build the
         model, and run the selected stages.

    Exits via `sys.exit` when `--resume_from_hf` is requested without a
    usable hub config, or when the remote `last/` checkpoint cannot be
    pulled. Returns None.
    """
    args = parse_args()
    logger = setup_logger("cxr_vlm_train")

    # Load configs
    model_cfg = OmegaConf.load(args.model_config)
    train_cfg = OmegaConf.load(args.train_config)

    logger.info(f"Model config: {args.model_config}")
    logger.info(f"Train config: {args.train_config}")

    # ── Resolve dataset spec (paths, tasks, weights) ─────────────────
    spec = resolve_dataset_spec(train_cfg)
    logger.info(f"Dataset: {spec.dataset_name}")
    logger.info(f"  image_root    = {spec.image_root}")
    logger.info(f"  instruct_json = {spec.instruct_json}")
    logger.info(f"  tasks         = {spec.tasks}")
    logger.info(f"  task_weights  = {spec.task_weights}")

    # ── Resolve per-dataset run_id (e.g. IU-Xray_run_3) ──────────────
    output_root = str(train_cfg.training.get("output_root", "checkpoints"))
    state_file  = str(train_cfg.hf_hub.run_state_file)
    # Local stage folder names, resolved once and reused everywhere below.
    stage1_subdir = str(train_cfg.stage1.get("subdir", "stage1_projection"))
    stage2_subdir = str(train_cfg.stage2.get("subdir", "stage2_instruct"))
    # Hub credentials are only looked up when hub tracking is enabled.
    hf_token    = os.environ.get(
        train_cfg.hf_hub.token_env, os.environ.get("HF_TOKEN")
    ) if train_cfg.hf_hub.enabled else None
    hf_repo_id  = train_cfg.hf_hub.repo_id if train_cfg.hf_hub.enabled else None

    # Unified --mode controller. Falls back to the legacy inference (any of
    # --resume_from / --resume_from_hf set ⇒ resuming) when --mode is unset.
    if args.mode == "resume":
        resuming = True
    elif args.mode == "fresh":
        resuming = False
    else:
        resuming = bool(args.resume_from) or args.resume_from_hf

    run_id = resolve_run_id(
        dataset_name = spec.dataset_name,
        output_root  = output_root,
        state_file   = state_file,
        resuming     = resuming,
        explicit     = args.run_id,
        hf_repo_id   = hf_repo_id,
        hf_token     = hf_token,
    )
    logger.info(f"run_id = {run_id}")

    # ── Optional: pull last/ from HF Hub for resume on a fresh VM ────
    if args.resume_from_hf:
        if not train_cfg.hf_hub.enabled or not train_cfg.hf_hub.repo_id:
            sys.exit("--resume_from_hf requires hf_hub.enabled=true and a repo_id")
        # On the hub the stage subfolder is just "stage1" / "stage2".
        # Anything other than an explicit --stage 1 targets stage2.
        hub_stage_dir = "stage1" if args.stage == 1 else "stage2"
        # Past the guard above the hub is enabled, so hf_repo_id / hf_token
        # already hold the configured repo and the resolved token.
        local_resume = pull_last_for_resume(
            repo_id      = hf_repo_id,
            token        = hf_token,
            run_id       = run_id,
            stage_subdir = hub_stage_dir,
            local_root   = str(Path(output_root) / "_resume_from_hf"),
        )
        if local_resume is None:
            sys.exit(f"Could not pull {run_id}/{hub_stage_dir}/last/ from HF — abort.")
        args.resume_from = local_resume
        logger.info(f"Will resume from pulled checkpoint: {local_resume} (stage{args.stage})")

    # ── Fresh-VM resume: hydrate from HF before detect_resume_point ──
    # When `--mode resume` is set but the local run dir is empty (Colab
    # persistence lost, switching machines), pull configs + last/best
    # checkpoints from HF Hub into the canonical local layout so the
    # detector finds them. No-op if local already has artifacts or HF
    # tracking is disabled.
    if args.mode == "resume" and hf_repo_id and hf_token:
        try:
            hydrate_run_dir_from_hf(
                repo_id       = hf_repo_id,
                token         = hf_token,
                run_id        = run_id,
                output_root   = output_root,
                stage1_subdir = stage1_subdir,
                stage2_subdir = stage2_subdir,
            )
        except Exception as e:
            # Deliberately best-effort: a failed hydrate must not block a
            # run that can still proceed from local state.
            logger.warning(f"[resume hydrate] {type(e).__name__}: {e}")

    # ── Compute per-stage output dirs under {output_root}/{run_id}/ ──
    stage1_out = stage_dir(output_root, run_id, stage1_subdir)
    stage2_out = stage_dir(output_root, run_id, stage2_subdir)

    # ── Auto-detect where to resume from (when --mode resume) ─────────
    # Examines disk state inside {output_root}/{run_id}/ and chooses:
    #   • stage1 from scratch / stage1 mid-checkpoint
    #   • stage2 from scratch (stage1 done) / stage2 mid-checkpoint
    #   • done (both stages finished — skip everything)
    # If the user passed --stage explicitly, that wins over auto-detect.
    auto_target_stage = None
    auto_resume_ckpt  = None
    if args.mode == "resume" and args.stage is None:
        auto_target_stage, auto_resume_ckpt = detect_resume_point(
            run_dir(output_root, run_id),
            stage1_subdir,
            stage2_subdir,
        )
        logger.info(
            f"[resume autodetect] target={auto_target_stage} "
            f"ckpt={auto_resume_ckpt}"
        )

    # ── Pretty plan banner (total steps across both stages) ───────────
    plan = compute_training_plan(train_cfg, spec.instruct_json)
    logger.info("\n" + _fmt_plan_banner(plan, run_id,
                                        auto_target_stage or "stage1",
                                        auto_resume_ckpt))

    if auto_target_stage == "done":
        logger.info("Both stages already complete for this run. Exiting cleanly.")
        return

    # A run counts as resumed when `--mode resume` was requested OR an
    # explicit / hub-pulled checkpoint path is set. Using this single flag
    # for the config snapshot, the tracker and the hub metadata keeps them
    # consistent with the `resuming` value given to resolve_run_id; an
    # auto-detected resume (no --resume_from) is no longer recorded as fresh.
    resumed = bool(resuming or args.resume_from)

    # ── Snapshot resolved config into the run dir ────────────────────
    # Every run gets its own self-describing folder so we never have to ask
    # "what config did IU-Xray_run_3 actually use?" — open run_meta.json.
    # Written AFTER stage dirs are created so the run dir definitely exists.
    save_run_config(
        run_dir_path = run_dir(output_root, run_id),
        spec         = spec,
        model_cfg    = model_cfg,
        train_cfg    = train_cfg,
        extra        = {
            "stage_arg":         args.stage,
            "resumed":           resumed,
            "resume_from":       args.resume_from,
            "resume_from_hf":    args.resume_from_hf,
            "model_config_path": args.model_config,
            "train_config_path": args.train_config,
        },
    )

    # Setup WandB
    if train_cfg.wandb.enabled:
        os.environ["WANDB_PROJECT"] = train_cfg.wandb.project

    # ── HuggingFace Hub tracker (optional) ───────────────────────────
    # Pass our resolved run_id as `explicit_run_id` so the tracker uses
    # the same dataset-prefixed folder on the hub.
    tracker = build_tracker_from_cfg(
        train_cfg,
        resuming        = resumed,
        explicit_run_id = run_id,
    )
    if tracker is not None:
        tracker.write_meta({
            "dataset_name": spec.dataset_name,
            "run_id":       run_id,
            "config_model": args.model_config,
            "config_train": args.train_config,
            "resumed":      resumed,
            "resume_from":  args.resume_from,
        })
        # Snapshot the resolved config + run_meta.json to HF so the run is
        # self-describing on the hub (you can answer "what config did
        # {run_id} actually use?" without pulling the whole checkpoint).
        # `save_run_config` writes these into {run_dir}/configs/ +
        # {run_dir}/run_meta.json a few lines above.
        rd = run_dir(output_root, run_id)
        if (rd / "configs").is_dir():
            tracker.upload_folder(str(rd / "configs"), "configs")
        if (rd / "run_meta.json").is_file():
            tracker.upload_file(str(rd / "run_meta.json"), "run_meta.json")

    # NOTE: model is built BELOW, after the stage-selection logic, so we know
    # whether Stage 1 runs in ITC mode (→ build a light model without Vicuna).

    # Run training stages
    #
    # Stage selection priority:
    #   1. Explicit --stage from CLI wins.
    #   2. --mode resume + auto-detect: skip stage1 when its final ckpt exists,
    #      resume stage1/stage2 from `auto_resume_ckpt` as detected above.
    #   3. Otherwise: enabled flags from train_cfg drive it (legacy: run both).
    if args.stage is not None:
        run_s1 = (args.stage == 1) and train_cfg.stage1.enabled
        run_s2 = (args.stage == 2) and train_cfg.stage2.enabled
    elif auto_target_stage == "stage2":
        # Stage 1 finished previously — skip it entirely.
        run_s1 = False
        run_s2 = train_cfg.stage2.enabled
    else:
        run_s1 = train_cfg.stage1.enabled
        run_s2 = train_cfg.stage2.enabled

    # Decide the resume checkpoint each stage should use.
    # Manual --resume_from still wins when --stage is given explicitly.
    s1_resume_path = None
    s2_resume_path = None
    if args.stage == 1:
        s1_resume_path = args.resume_from
    elif args.stage == 2:
        s2_resume_path = args.resume_from
    elif auto_target_stage == "stage1":
        s1_resume_path = str(auto_resume_ckpt) if auto_resume_ckpt else None
    elif auto_target_stage == "stage2":
        s2_resume_path = str(auto_resume_ckpt) if auto_resume_ckpt else None

    # ── Build model (per-stage) ──────────────────────────────────────
    # ITC Stage-1 needs no Vicuna → build a light model (saves ~13GB VRAM,
    # enables a big contrastive batch). Every other case builds the full
    # model with Vicuna + LoRA, byte-identical to the original flow.
    itc_on = _stage1_itc_cfg(train_cfg) is not None
    if itc_on and run_s1:
        logger.info("Building CXR VLM (light: ITC Stage-1, Vicuna NOT loaded)...")
        model = CXRVisionLanguageModel(model_cfg, load_llm=False, build_itc_head=True)
    else:
        logger.info("Building CXR VLM...")
        model = CXRVisionLanguageModel(model_cfg, load_llm=True, build_itc_head=False)
        if args.resume_from:
            logger.info(f"Resuming from: {args.resume_from}")
            load_checkpoint(model, args.resume_from)

    if run_s1:
        model = run_stage1(
            model, train_cfg, model_cfg, spec, stage1_out, logger,
            tracker     = tracker,
            resume_from = s1_resume_path,
        )

    if run_s2:
        # Stage 2 always needs the full model. If Stage 1 built a light
        # (no-Vicuna) model, free it and rebuild the full one; the stage1
        # projection is then seeded from the checkpoint just below.
        if getattr(model, "llm", None) is None:
            logger.info("Rebuilding full CXR VLM (with Vicuna) for Stage 2...")
            import gc
            # Drop the light model and release cached GPU memory before the
            # full model is allocated.
            del model
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            model = CXRVisionLanguageModel(model_cfg, load_llm=True, build_itc_head=False)
            if args.resume_from and not s2_resume_path:
                load_checkpoint(model, args.resume_from)
        # Load stage1 projection weights before stage2 if available.
        # Priority:
        #   1. Just finished stage1 in this run → use stage1_out/stage1_final.pt
        #   2. Not running stage1 but stage1_final.pt exists on disk → load it
        #   3. s2_resume_path set (we're mid-stage2) → Trainer will reload from
        #      the checkpoint itself; no need to seed stage1 weights here.
        #   4. Nothing → warn loudly; stage2 starts with random projection.
        # NOTE(review): the stage-1 Hub upload in this file references
        # `stage1_final_projection.pt` (i.e. `stage1_final_*` artifacts), not
        # a single `stage1_final.pt`. Confirm that `load_checkpoint` accepts
        # this path and that the `.exists()` check below can ever be true.
        stage1_ckpt = Path(stage1_out) / "stage1_final.pt"
        if run_s1:
            load_checkpoint(model, str(stage1_ckpt))
            logger.info(f"Loaded stage1 weights from this run: {stage1_ckpt}")
        elif stage1_ckpt.exists() and not s2_resume_path:
            load_checkpoint(model, str(stage1_ckpt))
            logger.info(f"Auto-loaded existing stage1 weights: {stage1_ckpt}")
        elif not s2_resume_path:
            logger.warning(
                "⚠ No stage1 weights found and not resuming. Projection layer "
                "will start RANDOMLY for stage2. Expect degraded convergence. "
                f"Looked at: {stage1_ckpt}"
            )

        model = run_stage2(
            model, train_cfg, model_cfg, spec, stage2_out, logger,
            resume_from = s2_resume_path,
            tracker     = tracker,
        )

    # Upload final results folder (predictions + metrics) if evaluate.py has run
    if tracker is not None:
        results_dir = Path("results") / run_id
        if results_dir.exists():
            tracker.upload_folder(str(results_dir), "results")
        tracker.write_meta({"training_complete": True})

    logger.info("Training complete!")

# Script entry guard: start training only when this file is executed
# (e.g. `python -m training.train`), not when it is imported as a module.
if __name__ == "__main__":
    main()