sreshwarprasad committed on
Commit
e36eee4
·
verified ·
1 Parent(s): 0ebfc32

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Lightweight CPU image for the FastAPI inference service.
FROM python:3.10-slim

WORKDIR /code

# System libraries needed by OpenCV at runtime (opencv-python-headless
# still links against libGL/glib); purge apt lists to keep the image small.
RUN apt-get update && apt-get install -y \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the source so this layer
# stays cached unless requirements.txt changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application source (app.py, src/, configs/, checkpoint).
COPY . /code/

# Run the FastAPI server on port 7860 (Hugging Face Spaces default)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
import os
import io
import base64
import numpy as np
from PIL import Image
import torch
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# Add current directory to path so HF Space finds it regardless of the
# working directory uvicorn was started from.
# FIX: `sys` and `os` were imported a second time here; the duplicates
# have been removed (they are already imported above).
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
    sys.path.append(current_dir)

from omegaconf import OmegaConf
from src.model import build_model
from src.attention_viz import attention_rollout_full, make_overlay
from src.dataset import QUESTION_GROUPS
from torchvision import transforms

app = FastAPI()

# Wide-open CORS: the API is consumed by a browser front-end hosted on a
# different origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Populated once by the startup hook (load_model) below.
model = None
cfg = None
transform = None
40
+
41
@app.on_event("startup")
def load_model():
    """Build the model, load the trained checkpoint and set up the
    inference transform. Runs once when the server starts; results are
    stored in the module-level ``model`` / ``cfg`` / ``transform``."""
    global model, cfg, transform
    print("Loading configuration...")
    base_cfg = OmegaConf.load(os.path.join(current_dir, "configs/base.yaml"))

    # Overlay the full-training experiment config when present, otherwise
    # fall back to the base config alone.
    # FIX: the original used a bare `except:`, which also swallows
    # SystemExit/KeyboardInterrupt and hides the reason for the fallback.
    try:
        exp_cfg = OmegaConf.load(os.path.join(current_dir, "configs/full_train.yaml"))
        cfg = OmegaConf.merge(base_cfg, exp_cfg)
    except Exception as e:
        print(f"WARNING: could not load configs/full_train.yaml ({e}); using base config")
        cfg = base_cfg

    print("Building model...")
    model = build_model(cfg).to(device)

    ckpt_path = os.path.join(current_dir, "best_full_train.pt")
    if os.path.exists(ckpt_path):
        print(f"Loading checkpoint from {ckpt_path}")
        # weights_only=True: never unpickle arbitrary objects from the file.
        ckpt = torch.load(ckpt_path, map_location=device, weights_only=True)
        model.load_state_dict(ckpt["model_state"])
    else:
        # Keep serving (predictions will come from the untrained model)
        # but make the misconfiguration loud in the logs.
        print(f"WARNING: Checkpoint not found at {ckpt_path}")

    model.eval()

    # Galaxy Zoo image transform: resize, crop, center, normalize
    # Assuming standard Imagenet + ViT transforms for 224x224
    transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
75
+
76
@app.post("/api/predict")
async def predict(file: UploadFile = File(...)):
    """Run one uploaded galaxy image through the model.

    Returns a JSON payload with per-question probability vectors
    (hierarchical softmax per QUESTION_GROUPS) and a base64-encoded PNG
    attention-rollout overlay, or ``heatmap: None`` when the model
    exposes no attention weights.
    """
    contents = await file.read()
    image = Image.open(io.BytesIO(contents)).convert("RGB")

    # Transform image
    img_tensor = transform(image).unsqueeze(0).to(device)

    # FIX: autocast was hard-coded to "cuda", which fails or misbehaves on
    # CPU-only deployments (the slim Docker image ships no GPU). Use the
    # actual device type and only enable mixed precision on GPU.
    with torch.no_grad():
        with torch.amp.autocast(device.type, enabled=device.type == "cuda"):
            logits = model(img_tensor)

    # Attention weights captured during the forward pass (may be None).
    layers = model.get_all_attention_weights()

    # Hierarchical softmax is applied independently per question group.
    import torch.nn.functional as F
    probs = logits.detach().float().cpu().clone()
    for q_name, (start, end) in QUESTION_GROUPS.items():
        probs[:, start:end] = F.softmax(probs[:, start:end], dim=-1)

    probs_np = probs[0].numpy()
    results = {q_name: probs_np[start:end].tolist()
               for q_name, (start, end) in QUESTION_GROUPS.items()}

    # Generate Attention Heatmap Overlay
    if layers is not None:
        # attention_rollout_full expects list of [B, H, N+1, N+1]
        all_layer_attns = [l.cpu() for l in layers]
        rollout_map = attention_rollout_full(all_layer_attns, patch_size=16, image_size=224)[0]

        # original image numpy for overlay (denormalised size)
        original_img_np = np.array(image.resize((224, 224)))
        overlay = make_overlay(original_img_np, rollout_map, alpha=0.5, colormap="inferno")

        # Encode to base64
        overlay_img = Image.fromarray(overlay)
        buffered = io.BytesIO()
        overlay_img.save(buffered, format="PNG")
        heatmap_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    else:
        heatmap_base64 = None

    return {
        "predictions": results,
        "heatmap": heatmap_base64
    }
best_full_train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f31287bca388d29f3b144a9a407f041b4f02c6742ece7614751aab70cd6f04ea
3
+ size 343371682
configs/ablation.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────
2
+ # configs/ablation.yaml
3
+ # Phase 1: lambda_kl ablation on a 10k subset.
4
+ # Run FIRST before any full training.
5
+ # ─────────────────────────────────────────────────────────────
6
+
7
+ defaults:
8
+ - base
9
+
10
+ experiment_name : "ablation"
11
+
12
+ data:
13
+ n_samples : 10000 # ablation uses 10k for speed
14
+
15
+ training:
16
+ epochs : 15 # sufficient to converge on 10k
17
+
18
+ scheduler:
19
+ T_max : 15
20
+
21
+ early_stopping:
22
+ patience : 5
23
+
24
+ wandb:
25
+ log_attention_every_n_epochs : 99 # disable attention in ablation
configs/base.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────
2
+ # configs/base.yaml
3
+ # Base configuration for all experiments.
4
+ # All experiment configs inherit and override from this file.
5
+ # ─────────────────────────────────────────────────────────────
6
+
7
+ project_name : "gz2-hierarchical-vit"
8
+ experiment_name : "base"
9
+ seed : 42
10
+
11
+ data:
12
+ parquet_path : "data/labels.parquet"
13
+ image_dir : "data/images"
14
+ image_id_col : "dr7objid"
15
+ image_size : 224
16
+ n_samples : null # null = full dataset
17
+ train_frac : 0.80
18
+ val_frac : 0.10
19
+ test_frac : 0.10
20
+ num_workers : 12
21
+ pin_memory : true
22
+ persistent_workers : true
23
+ prefetch_factor : 4
24
+
25
+ model:
26
+ backbone : "vit_base_patch16_224"
27
+ pretrained : true
28
+ # FIXED: increased from 0.1 β†’ 0.3 to reduce overfitting on 86M-param model.
29
+ # Loss curves showed train/val divergence from epoch ~12 with dropout=0.1.
30
+ dropout : 0.3
31
+
32
+ loss:
33
+ lambda_kl : 0.5 # weight of KL divergence term
34
+ lambda_mse : 0.5 # weight of MSE term
35
+ epsilon : 1.0e-8 # numerical stability clamp
36
+
37
+ training:
38
+ epochs : 100
39
+ batch_size : 64
40
+ learning_rate : 1.0e-4
41
+ weight_decay : 1.0e-4
42
+ grad_clip : 1.0
43
+ mixed_precision : true
44
+
45
+ early_stopping:
46
+ patience : 10
47
+ min_delta : 1.0e-5
48
+ monitor : "val/loss_total"
49
+
50
+ scheduler:
51
+ name : "cosine"
52
+ T_max : 100
53
+ eta_min : 1.0e-6
54
+
55
+ outputs:
56
+ checkpoint_dir : "outputs/checkpoints"
57
+ figures_dir : "outputs/figures"
58
+ log_dir : "outputs/logs"
59
+
60
+ wandb:
61
+ enabled : true
62
+ project : "gz2-hierarchical-vit" # new project name
63
+ log_attention_every_n_epochs : 5
64
+ n_attention_samples : 8
configs/full_train.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────
2
+ # configs/full_train.yaml
3
+ # Phase 3: Full training on complete 239k dataset.
4
+ # Run after ablation confirms lambda_kl = 0.5 is optimal.
5
+ # ─────────────────────────────────────────────────────────────
6
+
7
+ defaults:
8
+ - base
9
+
10
+ experiment_name : "full_train"
11
+
12
+ data:
13
+ n_samples : null # full 239k dataset
14
+
15
+ training:
16
+ epochs : 100
17
+ batch_size : 64
18
+
19
+ scheduler:
20
+ T_max : 100
21
+
22
+ early_stopping:
23
+ patience : 10
24
+
25
+ wandb:
26
+ log_attention_every_n_epochs : 5
configs/subset_60k.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────
2
+ # configs/subset_60k.yaml
3
+ # Phase 2: sanity check / quick prototype on 60k subset.
4
+ # Use for code verification before full training.
5
+ # ─────────────────────────────────────────────────────────────
6
+
7
+ defaults:
8
+ - base
9
+
10
+ experiment_name : "subset_60k"
11
+
12
+ data:
13
+ n_samples : 60000 # 60k random galaxies
14
+
15
+ training:
16
+ epochs : 30
17
+
18
+ scheduler:
19
+ T_max : 30
20
+
21
+ early_stopping:
22
+ patience : 7
23
+
24
+ wandb:
25
+ log_attention_every_n_epochs : 5
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ python-multipart
4
+ pydantic
5
+ torch
6
+ torchvision
7
+ numpy
8
+ Pillow
9
+ omegaconf
10
+ timm
11
+ opencv-python-headless
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (128 Bytes). View file
 
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (132 Bytes). View file
 
src/__pycache__/ablation.cpython-310.pyc ADDED
Binary file (7.39 kB). View file
 
src/__pycache__/attention_viz.cpython-310.pyc ADDED
Binary file (9.72 kB). View file
 
src/__pycache__/baselines.cpython-310.pyc ADDED
Binary file (22.4 kB). View file
 
src/__pycache__/baselines.cpython-312.pyc ADDED
Binary file (39.1 kB). View file
 
src/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (7.46 kB). View file
 
src/__pycache__/evaluate_full.cpython-310.pyc ADDED
Binary file (18.2 kB). View file
 
src/__pycache__/evaluate_full.cpython-312.pyc ADDED
Binary file (30.7 kB). View file
 
src/__pycache__/loss.cpython-310.pyc ADDED
Binary file (5.32 kB). View file
 
src/__pycache__/metrics.cpython-310.pyc ADDED
Binary file (8.73 kB). View file
 
src/__pycache__/model.cpython-310.pyc ADDED
Binary file (11.1 kB). View file
 
src/__pycache__/model.cpython-312.pyc ADDED
Binary file (16 kB). View file
 
src/__pycache__/train.cpython-310.pyc ADDED
Binary file (8.82 kB). View file
 
src/__pycache__/train_single.cpython-310.pyc ADDED
Binary file (10.4 kB). View file
 
src/__pycache__/train_single.cpython-312.pyc ADDED
Binary file (18.3 kB). View file
 
src/__pycache__/uncertainty_analysis.cpython-310.pyc ADDED
Binary file (17.1 kB). View file
 
src/ablation.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/ablation.py
3
+ ---------------
4
+ Lambda ablation study for the hierarchical KL + MSE loss.
5
+
6
+ Sweeps lambda_kl over [0.0, 0.25, 0.50, 0.75, 1.0] on a 10k subset
7
+ to justify the choice of lambda_kl = 0.5 used in the proposed model.
8
+
9
+ This ablation is reported in the paper as justification for the
10
+ balanced KL + MSE formulation. It is run BEFORE full training.
11
+
12
+ Output
13
+ ------
14
+ outputs/figures/ablation/table_lambda_ablation.csv
15
+ outputs/figures/ablation/fig_lambda_ablation.pdf
16
+ outputs/figures/ablation/fig_lambda_ablation.png
17
+
18
+ Usage
19
+ -----
20
+ cd ~/galaxy
21
+ nohup python -m src.ablation --config configs/ablation.yaml \
22
+ > outputs/logs/ablation.log 2>&1 &
23
+ echo "PID: $!"
24
+ """
25
+
26
+ import argparse
27
+ import copy
28
+ import logging
29
+ import random
30
+ import sys
31
+ import gc
32
+ from pathlib import Path
33
+
34
+ import numpy as np
35
+ import pandas as pd
36
+ import torch
37
+ import matplotlib
38
+ matplotlib.use("Agg")
39
+ import matplotlib.pyplot as plt
40
+ from torch.amp import autocast, GradScaler
41
+ from omegaconf import OmegaConf, DictConfig
42
+ from tqdm import tqdm
43
+
44
+ from src.dataset import build_dataloaders
45
+ from src.model import build_model
46
+ from src.loss import HierarchicalLoss
47
+ from src.metrics import compute_metrics, predictions_to_numpy
48
+
49
+ logging.basicConfig(
50
+ format="%(asctime)s %(levelname)s %(message)s",
51
+ datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout,
52
+ )
53
+ log = logging.getLogger("ablation")
54
+
55
+ LAMBDA_VALUES = [0.0, 0.25, 0.50, 0.75, 1.0]
56
+ ABLATION_EPOCHS = 15 # sufficient to converge on 10k subset
57
+ ABLATION_SAMPLES = 10000
58
+
59
+
60
+ def _set_seed(seed: int):
61
+ random.seed(seed)
62
+ np.random.seed(seed)
63
+ torch.manual_seed(seed)
64
+ torch.cuda.manual_seed_all(seed)
65
+
66
+
67
def run_single(cfg: DictConfig, lambda_kl: float) -> dict:
    """
    Train one model with the given lambda_kl on a 10k subset and
    return test metrics. All other settings are identical across runs.

    Parameters
    ----------
    cfg       : merged experiment config (seed, data, loss, training, ...)
    lambda_kl : KL weight for this run; MSE weight is set to 1 - lambda_kl.

    Returns
    -------
    dict with the lambda pair, best validation loss, and weighted
    MAE / RMSE / mean-ECE on the test split (rounded for the table).
    """
    _set_seed(cfg.seed)

    # Deep-copy so mutating loss/data/training settings for this run
    # never leaks into the config shared across the sweep.
    cfg = copy.deepcopy(cfg)
    cfg.loss.lambda_kl = lambda_kl
    cfg.loss.lambda_mse = 1.0 - lambda_kl  # the two weights always sum to 1
    cfg.data.n_samples = ABLATION_SAMPLES
    cfg.training.epochs = ABLATION_EPOCHS

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, test_loader = build_dataloaders(cfg)

    model = build_model(cfg).to(device)
    loss_fn = HierarchicalLoss(cfg)

    # Discriminative learning rates: pretrained backbone at 0.1x the head LR.
    optimizer = torch.optim.AdamW(
        [
            {"params": model.backbone.parameters(),
             "lr": cfg.training.learning_rate * 0.1},
            {"params": model.head.parameters(),
             "lr": cfg.training.learning_rate},
        ],
        weight_decay=cfg.training.weight_decay,
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=ABLATION_EPOCHS, eta_min=1e-6
    )
    scaler = GradScaler("cuda")

    best_val = float("inf")
    best_state = None  # CPU snapshot of the best-so-far weights

    for epoch in range(1, ABLATION_EPOCHS + 1):
        # ── train ──────────────────────────────────────────────
        model.train()
        for images, targets, weights, _ in tqdm(
            train_loader, desc=f"λ={lambda_kl:.2f} E{epoch}", leave=False
        ):
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            weights = weights.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            with autocast("cuda", enabled=True):
                logits = model(images)
                loss, _ = loss_fn(logits, targets, weights)
            # Mixed-precision step: unscale before clipping so the norm
            # threshold applies to the true (unscaled) gradients.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
        # One cosine step per epoch (T_max is expressed in epochs).
        # NOTE(review): indentation was lost in this copy; per-epoch
        # stepping is assumed from T_max=ABLATION_EPOCHS — confirm.
        scheduler.step()

        # ── validate ───────────────────────────────────────────
        model.eval()
        val_loss = 0.0
        nb = 0  # number of validation batches seen
        with torch.no_grad():
            for images, targets, weights, _ in val_loader:
                images = images.to(device, non_blocking=True)
                targets = targets.to(device, non_blocking=True)
                weights = weights.to(device, non_blocking=True)
                with autocast("cuda", enabled=True):
                    logits = model(images)
                    loss, _ = loss_fn(logits, targets, weights)
                val_loss += loss.item()
                nb += 1
        val_loss /= nb
        log.info(" λ_kl=%.2f epoch=%d val_loss=%.5f", lambda_kl, epoch, val_loss)

        # Keep a CPU copy of the best weights (early-stopping style).
        if val_loss < best_val:
            best_val = val_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    # ── test evaluation ────────────────────────────────────────
    model.load_state_dict(best_state)
    model.eval()

    all_preds, all_targets, all_weights = [], [], []
    with torch.no_grad():
        for images, targets, weights, _ in test_loader:
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            weights = weights.to(device, non_blocking=True)
            with autocast("cuda", enabled=True):
                logits = model(images)
            p, t, w = predictions_to_numpy(logits, targets, weights)
            all_preds.append(p)
            all_targets.append(t)
            all_weights.append(w)

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    all_weights = np.concatenate(all_weights)
    metrics = compute_metrics(all_preds, all_targets, all_weights)

    return {
        "lambda_kl" : lambda_kl,
        "lambda_mse" : round(1.0 - lambda_kl, 2),
        "best_val_loss": round(best_val, 5),
        "mae_weighted" : round(metrics["mae/weighted_avg"], 5),
        "rmse_weighted": round(metrics["rmse/weighted_avg"], 5),
        "ece_mean" : round(metrics["ece/mean"], 5),
    }
174
+
175
+
176
def _plot_ablation(df: pd.DataFrame, save_dir: Path):
    """Three-panel figure (MAE / RMSE / ECE vs lambda_kl), saved as
    PDF + PNG in *save_dir*. The winning lambda (minimum weighted MAE)
    is marked with a dashed vertical line in every panel."""
    best_row = df.loc[df["mae_weighted"].idxmin()]

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # (df column, y-axis label, line colour) for each panel.
    metrics_cfg = [
        ("mae_weighted", "Weighted MAE", "#2980b9"),
        ("rmse_weighted", "Weighted RMSE", "#c0392b"),
        ("ece_mean", "Mean ECE", "#27ae60"),
    ]

    for ax, (col, ylabel, color) in zip(axes, metrics_cfg):
        ax.plot(df["lambda_kl"], df[col], "-o", color=color,
                linewidth=2, markersize=8)
        # Mark the overall winner (chosen by MAE) in every panel.
        ax.axvline(best_row["lambda_kl"], color="#7f8c8d",
                   linestyle="--", alpha=0.8,
                   label=f"Best λ = {best_row['lambda_kl']:.2f}")
        ax.set_xlabel("$\\lambda_{\\mathrm{KL}}$ "
                      "(0 = pure MSE, 1 = pure KL)", fontsize=11)
        ax.set_ylabel(ylabel, fontsize=11)
        ax.set_title(f"Lambda ablation — {ylabel}", fontsize=10)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.3)
        # Ticks exactly at the swept lambda values.
        ax.set_xticks(df["lambda_kl"].tolist())

    plt.suptitle(
        "Ablation study: effect of $\\lambda_{\\mathrm{KL}}$ in the hierarchical loss\n"
        f"10,000-sample subset, seed=42. Best: $\\lambda_{{\\mathrm{{KL}}}}$"
        f" = {best_row['lambda_kl']:.2f} (MAE = {best_row['mae_weighted']:.5f})",
        fontsize=11, y=1.02,
    )
    plt.tight_layout()
    fig.savefig(save_dir / "fig_lambda_ablation.pdf", dpi=300, bbox_inches="tight")
    fig.savefig(save_dir / "fig_lambda_ablation.png", dpi=300, bbox_inches="tight")
    plt.close(fig)  # free memory; the module runs headless (Agg backend)
    log.info("Saved: fig_lambda_ablation")
212
+
213
+
214
def main():
    """Run the full lambda_kl sweep and write the results table + figure."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True)
    args = parser.parse_args()

    # Experiment config overrides the base config (same merge scheme as
    # regular training).
    base_cfg = OmegaConf.load("configs/base.yaml")
    exp_cfg = OmegaConf.load(args.config)
    cfg = OmegaConf.merge(base_cfg, exp_cfg)

    save_dir = Path(cfg.outputs.figures_dir) / "ablation"
    save_dir.mkdir(parents=True, exist_ok=True)

    results = []
    for lam in LAMBDA_VALUES:
        log.info("=" * 55)
        log.info("Ablation: lambda_kl=%.2f lambda_mse=%.2f",
                 lam, 1.0 - lam)
        log.info("=" * 55)

        result = run_single(cfg, lam)
        results.append(result)
        log.info("Result: %s", result)

        # Free up RAM and GPU memory between runs so the sweep fits on
        # one device.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    df = pd.DataFrame(results)
    df.to_csv(save_dir / "table_lambda_ablation.csv", index=False)
    log.info("Saved: table_lambda_ablation.csv")

    print()
    print(df.to_string(index=False))
    print()

    # Winner is chosen by weighted MAE (same criterion as the figure).
    best = df.loc[df["mae_weighted"].idxmin()]
    log.info("Best: lambda_kl=%.2f MAE=%.5f RMSE=%.5f",
             best["lambda_kl"], best["mae_weighted"], best["rmse_weighted"])

    _plot_ablation(df, save_dir)


if __name__ == "__main__":
    main()
src/attention_viz.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/attention_viz.py
3
+ --------------------
4
+ Full multi-layer attention rollout for ViT explainability.
5
+
6
+ Theory β€” Abnar & Zuidema (2020)
7
+ --------------------------------
8
+ Each ViT transformer block l produces attention weights A_l of shape
9
+ [B, H, N+1, N+1], where H=12 heads and N+1=197 tokens (196 patches
10
+ + 1 CLS token).
11
+
12
+ Full rollout algorithm:
13
+ 1. Average over heads: A_l = mean_h(attn_l) [B, N+1, N+1]
14
+ 2. Add residual: A_l = 0.5*A_l + 0.5*I [B, N+1, N+1]
15
+ 3. Row-normalise so attention sums to 1 per token.
16
+ 4. Chain layers: R = A_1 βŠ— A_2 βŠ— ... βŠ— A_12 [B, N+1, N+1]
17
+ 5. CLS row, patch cols: rollout = R[:, 0, 1:] [B, 196]
18
+ 6. Reshape 196 β†’ 14Γ—14, upsample to 224Γ—224.
19
+
20
+ FIX applied vs. original
21
+ --------------------------
22
+ The original code used R = bmm(A, R) (left-multiplication) which
23
+ accumulates attention in reverse order. The correct propagation per
24
+ Abnar & Zuidema is R = bmm(R, A) (right-multiplication), which
25
+ tracks how information from the INPUT patches flows forward through
26
+ successive layers into the CLS token.
27
+
28
+ Entropy interpretation
29
+ -----------------------
30
+ CLS attention entropy INCREASES from early to late layers. This is
31
+ the expected and correct behaviour for ViT classification:
32
+ - Early layers (1–8): entropy is low and stable (~1.7–2.0 nats),
33
+ consistent with local morphological feature detection.
34
+ - Late layers (9–12): entropy rises sharply (~2.7–4.5 nats),
35
+ consistent with the CLS token performing global integration β€”
36
+ aggregating information from all patches before the regression head.
37
+ This pattern confirms that early layers specialise in local structure
38
+ while late layers globally aggregate morphological information for
39
+ the final prediction.
40
+
41
+ References
42
+ ----------
43
+ Abnar & Zuidema (2020). Quantifying Attention Flow in Transformers.
44
+ ACL 2020. https://arxiv.org/abs/2005.00928
45
+ """
46
+
47
+ from __future__ import annotations
48
+
49
+ import numpy as np
50
+ import torch
51
+ import torch.nn.functional as F
52
+ import matplotlib
53
+ matplotlib.use("Agg")
54
+ import matplotlib.pyplot as plt
55
+ import matplotlib.cm as cm
56
+ from pathlib import Path
57
+ from typing import Optional, List
58
+
59
+
60
+ # ─────────────────────────────────────────────────────────────
61
+ # Full multi-layer rollout (FIXED)
62
+ # ─────────────────────────────────────────────────────────────
63
+
64
+ def attention_rollout_full(
65
+ all_attn_weights: List[torch.Tensor],
66
+ patch_size: int = 16,
67
+ image_size: int = 224,
68
+ ) -> np.ndarray:
69
+ """
70
+ Full multi-layer attention rollout per Abnar & Zuidema (2020).
71
+
72
+ Parameters
73
+ ----------
74
+ all_attn_weights : list of L tensors, each [B, H, N+1, N+1]
75
+ One tensor per transformer layer, in order 1 β†’ L.
76
+ patch_size : ViT patch size (16 for ViT-Base/16)
77
+ image_size : input image size (224)
78
+
79
+ Returns
80
+ -------
81
+ rollout_maps : [B, image_size, image_size] float32 in [0, 1]
82
+ """
83
+ assert len(all_attn_weights) > 0, "Need at least one attention layer"
84
+
85
+ B, H, N1, _ = all_attn_weights[0].shape
86
+ device = all_attn_weights[0].device
87
+
88
+ # Identity matrix: R_0 = I
89
+ R = torch.eye(N1, device=device).unsqueeze(0).expand(B, -1, -1).clone()
90
+
91
+ for attn in all_attn_weights:
92
+ # Step 1: average over heads β†’ [B, N+1, N+1]
93
+ A = attn.mean(dim=1)
94
+
95
+ # Step 2: residual connection
96
+ I = torch.eye(N1, device=device).unsqueeze(0)
97
+ A = 0.5 * A + 0.5 * I
98
+
99
+ # Step 3: row-normalise
100
+ A = A / A.sum(dim=-1, keepdim=True).clamp(min=1e-8)
101
+
102
+ # Step 4: chain rollout β€” FIXED: R = R @ A (right-multiply)
103
+ # This propagates information forward from input to CLS.
104
+ # Original had R = A @ R (left-multiply) which is incorrect.
105
+ R = torch.bmm(R, A)
106
+
107
+ # Step 5: CLS row (index 0), patch columns (1 onwards)
108
+ cls_attn = R[:, 0, 1:] # [B, 196]
109
+
110
+ # Step 6: reshape and upsample to image size
111
+ grid_size = image_size // patch_size # 14
112
+ cls_attn = cls_attn.reshape(B, 1, grid_size, grid_size)
113
+ rollout = F.interpolate(
114
+ cls_attn, size=(image_size, image_size),
115
+ mode="bilinear", align_corners=False,
116
+ ).squeeze(1) # [B, 224, 224]
117
+
118
+ rollout_np = rollout.cpu().numpy()
119
+ for i in range(B):
120
+ mn, mx = rollout_np[i].min(), rollout_np[i].max()
121
+ rollout_np[i] = (rollout_np[i] - mn) / (mx - mn + 1e-8)
122
+
123
+ return rollout_np.astype(np.float32)
124
+
125
+
126
def attention_rollout_single_layer(
    attn_weights: torch.Tensor,
    patch_size: int = 16,
    image_size: int = 224,
) -> np.ndarray:
    """Rollout over one layer's attention only (kept for backward
    compatibility); the full multi-layer rollout is preferred."""
    single_layer = [attn_weights]
    return attention_rollout_full(
        single_layer,
        patch_size=patch_size,
        image_size=image_size,
    )
+ )
135
+
136
+
137
+ # ─────────────────────────────────────────────────────────────
138
+ # Visualisation utilities
139
+ # ─────────────────────────────────────────────────────────────
140
+
141
def denormalise_image(tensor: torch.Tensor) -> np.ndarray:
    """Undo ImageNet normalisation on a [3, H, W] tensor → uint8 [H, W, 3]."""
    imagenet_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    # Channels-last for plotting, then reverse (x - mean) / std and clip.
    chw = tensor.cpu().numpy()
    hwc = np.transpose(chw, (1, 2, 0))
    restored = np.clip(hwc * imagenet_std + imagenet_mean, 0, 1)
    return (restored * 255).astype(np.uint8)
148
+
149
+
150
def make_overlay(
    image_np: np.ndarray,
    rollout: np.ndarray,
    alpha: float = 0.5,
    colormap: str = "inferno",
) -> np.ndarray:
    """Blend an attention heatmap onto a galaxy image.

    Parameters
    ----------
    image_np : [H, W, 3] uint8 galaxy image
    rollout  : [H, W] attention map, expected in [0, 1]
    alpha    : heatmap opacity (0 = image only, 1 = heatmap only)
    colormap : matplotlib colormap name

    Returns
    -------
    [H, W, 3] uint8 blended overlay.
    """
    # FIX: ``cm.get_cmap`` was deprecated in matplotlib 3.7 and removed
    # in 3.9; the ``matplotlib.colormaps`` registry is the supported API.
    cmap = matplotlib.colormaps[colormap]
    heatmap = (cmap(rollout)[:, :, :3] * 255).astype(np.uint8)
    overlay = (
        (1 - alpha) * image_np.astype(np.float32) +
        alpha * heatmap.astype(np.float32)
    ).clip(0, 255).astype(np.uint8)
    return overlay
+ return overlay
164
+
165
+
166
def plot_attention_grid(
    images: torch.Tensor,
    attn_weights,
    image_ids: list,
    save_path: Optional[str] = None,
    alpha: float = 0.5,
    n_cols: int = 4,
    rollout_mode: str = "full",
) -> plt.Figure:
    """
    Publication-quality attention rollout gallery.

    Each galaxy occupies two stacked panels: the raw (denormalised)
    image on top and the attention overlay directly below it.

    Parameters
    ----------
    images : [N, 3, H, W] galaxy image tensors
    attn_weights : list of L tensors [N, H, N+1, N+1] (full mode)
                   or single tensor [N, H, N+1, N+1] (single mode)
    image_ids : dr7objid list for panel titles
    save_path : optional file path to save the figure
    alpha : heatmap opacity (0 = image only, 1 = heatmap only)
    n_cols : number of columns in the grid
    rollout_mode : "full" for 12-layer rollout (recommended)
    """
    N = images.shape[0]

    # Full rollout needs the per-layer list; otherwise fall back to the
    # last layer only.
    if rollout_mode == "full" and isinstance(attn_weights, list):
        rollout_maps = attention_rollout_full(attn_weights)
    else:
        if isinstance(attn_weights, list):
            attn_weights = attn_weights[-1]
        rollout_maps = attention_rollout_single_layer(attn_weights)

    # Two figure rows (image + overlay) per grid row of galaxies.
    n_rows = int(np.ceil(N / n_cols))
    fig, axes = plt.subplots(
        n_rows * 2, n_cols,
        figsize=(n_cols * 3, n_rows * 6),
        facecolor="black",
    )
    axes = axes.flatten()

    for i in range(N):
        img_np = denormalise_image(images[i])
        overlay = make_overlay(img_np, rollout_maps[i], alpha=alpha)

        # Galaxy i's image goes on flattened row ``row_base``; its overlay
        # sits one figure-row below.
        row_base = (i // n_cols) * 2
        col = i % n_cols
        ax_img = axes[row_base * n_cols + col]
        ax_attn = axes[(row_base + 1) * n_cols + col]

        ax_img.imshow(img_np)
        ax_img.axis("off")
        # Last 6 digits of the object id keep the titles compact.
        ax_img.set_title(str(image_ids[i])[-6:], color="white",
                         fontsize=7, pad=2)
        ax_attn.imshow(overlay)
        ax_attn.axis("off")

    # Hide empty panels
    # NOTE(review): this indexes the flattened axes with grid-cell indices
    # [N, n_rows*n_cols) and so may not hide the matching overlay-row
    # panels of unused cells — confirm against a non-full final row.
    for j in range(N, n_rows * n_cols):
        if j < len(axes):
            axes[j].axis("off")

    mode_label = "Full 12-layer rollout" if rollout_mode == "full" else "Last-layer rollout"
    plt.suptitle(
        f"Galaxy attention rollout — {mode_label} (ViT-Base/16)",
        color="white", fontsize=10, y=1.01
    )
    plt.tight_layout(pad=0.3)

    if save_path is not None:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="black")

    return fig
+ return fig
239
+
240
+
241
+ # ─────────────────────────────────────────────────────────────
242
+ # Attention entropy per layer
243
+ # ─────────────────────────────────────────────────────────────
244
+
245
def compute_attention_entropy_per_layer(
    all_attn_weights: List[torch.Tensor],
) -> np.ndarray:
    """
    Mean Shannon entropy (nats) of the CLS token's attention over the
    patch tokens, computed separately for each transformer layer.

    Low entropy means the CLS token focuses on few patches (local
    feature detection); high entropy means it attends broadly (global
    aggregation). High entropy in late layers is expected for ViT
    models and does not imply the attention is uninformative.

    Returns
    -------
    entropies : [L] float32, one mean entropy per layer in nats
    """
    per_layer = []
    for layer_attn in all_attn_weights:
        # CLS (token 0) attention over patch tokens, floored so that
        # log() stays finite: [B, H, N_patches]
        cls_to_patches = layer_attn[:, :, 0, 1:].clamp(min=1e-9)
        entropy = -(cls_to_patches * cls_to_patches.log()).sum(dim=-1)  # [B, H]
        per_layer.append(entropy.mean().item())
    return np.array(per_layer, dtype=np.float32)
+ return np.array(entropies, dtype=np.float32)
278
+
279
+
280
def plot_attention_entropy(
    all_attn_weights: List[torch.Tensor],
    save_path: Optional[str] = None,
) -> plt.Figure:
    """
    Plot CLS attention entropy per transformer layer.

    Shaded bands split the depth into an early "local feature
    detection" regime and a late "global integration" regime; the 8.5
    boundary is a fixed heuristic, not derived from the data.
    """
    entropies = compute_attention_entropy_per_layer(all_attn_weights)
    L = len(entropies)

    fig, ax = plt.subplots(figsize=(8, 4))
    # 1-indexed layer axis to match the usual "layer 1..12" convention.
    ax.plot(range(1, L + 1), entropies, "b-o", markersize=6, linewidth=2)

    # Shade regions for interpretation
    ax.axvspan(1, 8.5, alpha=0.07, color="blue",
               label="Local feature detection (layers 1–8)")
    ax.axvspan(8.5, L + 0.5, alpha=0.07, color="orange",
               label="Global integration (layers 9–12)")

    ax.set_xlabel("Transformer layer", fontsize=12)
    ax.set_ylabel("Mean CLS attention entropy (nats)", fontsize=12)
    ax.set_title(
        "CLS token attention entropy vs. transformer depth\n"
        "Early layers: local morphological detection | "
        "Late layers: global aggregation",
        fontsize=10,
    )
    ax.set_xticks(range(1, L + 1))
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight")

    return fig
src/baselines.py ADDED
@@ -0,0 +1,844 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/baselines.py
3
+ ----------------
4
+ Consolidated baseline training for the GZ2 hierarchical probabilistic
5
+ regression paper. ALL baselines are trained from this single script.
6
+
7
+ Replaces the three separate scripts:
8
+ src/baselines.py (was: ResNet-18 MSE + ViT MSE)
9
+ src/run_resnet_kl.py (was: ResNet-18 KL+MSE β€” now merged here)
10
+ src/train_dirichlet.py (was: ViT Dirichlet β€” now merged here)
11
+
12
+ DELETE those three original files after switching to this one.
13
+
14
+ Baselines trained
15
+ -----------------
16
+ B1. ResNet-18 + independent MSE (sigmoid)
17
+ β€” CNN, no hierarchy, no KL. Demonstrates the cost of
18
+ ignoring the decision-tree structure.
19
+
20
+ B2. ResNet-18 + hierarchical KL+MSE
21
+ β€” Same loss as proposed, CNN backbone.
22
+ Isolates ViT vs. CNN contribution.
23
+
24
+ B3. ViT-Base + hierarchical MSE only (no KL)
25
+ β€” Same backbone as proposed, KL term removed.
26
+ Isolates contribution of the KL term.
27
+
28
+ B4. ViT-Base + Dirichlet NLL (Zoobot-style)
29
+ β€” Direct comparison with the established Zoobot approach
30
+ (Walmsley et al. 2022, MNRAS 509, 3966).
31
+
32
+ Proposed model (not trained here β€” trained via src/train.py):
33
+ ViT-Base + hierarchical KL+MSE β†’ outputs/checkpoints/best_full_train.pt
34
+
35
+ Consistency guarantee
36
+ ---------------------
37
+ All baselines use identical:
38
+ - Random seed, data split, batch size, epochs, early stopping
39
+ - AdamW optimiser, CosineAnnealingLR, gradient clipping
40
+ - Image transforms and evaluation metric (compute_metrics on same test split)
41
+
42
+ The ONLY differences between models are the backbone and/or loss function.
43
+
44
+ Usage
45
+ -----
46
+ cd ~/galaxy
47
+ nohup python -m src.baselines --config configs/full_train.yaml \
48
+ > outputs/logs/baselines.log 2>&1 &
49
+ echo "PID: $!"
50
+ """
51
+
52
+ import argparse
53
+ import logging
54
+ import random
55
+ import sys
56
+ from pathlib import Path
57
+
58
+ import numpy as np
59
+ import pandas as pd
60
+ import torch
61
+ import timm
62
+ import torch.nn as nn
63
+ import torch.nn.functional as F
64
+ import matplotlib
65
+ matplotlib.use("Agg")
66
+ import matplotlib.pyplot as plt
67
+ from torch.amp import autocast, GradScaler
68
+ from omegaconf import OmegaConf
69
+ from tqdm import tqdm
70
+
71
+ import wandb
72
+
73
+ from src.dataset import build_dataloaders, QUESTION_GROUPS
74
+ from src.loss import HierarchicalLoss, DirichletLoss, MSEOnlyLoss
75
+ from src.metrics import (compute_metrics, predictions_to_numpy,
76
+ dirichlet_predictions_to_numpy, simplex_violation_rate)
77
+ from src.model import build_model, build_dirichlet_model
78
+
79
+ logging.basicConfig(
80
+ format="%(asctime)s %(levelname)s %(message)s",
81
+ datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout,
82
+ )
83
+ log = logging.getLogger("baselines")
84
+
85
# Human-readable names for the 11 GZ2 decision-tree questions (t01-t11);
# used for tick labels in the comparison figures below.
QUESTION_LABELS = {
    "t01": "Smooth or features", "t02": "Edge-on disk",
    "t03": "Bar", "t04": "Spiral arms",
    "t05": "Bulge prominence", "t06": "Odd feature",
    "t07": "Roundedness", "t08": "Odd feature type",
    "t09": "Bulge shape", "t10": "Arms winding",
    "t11": "Arms number",
}
93
+
94
+
95
+ # ─────────────────────────────────────────────────────────────
96
+ # Reproducibility
97
+ # ─────────────────────────────────────────────────────────────
98
+
99
def set_seed(seed: int):
    """Seed every RNG in use (Python, NumPy, PyTorch CPU/CUDA) and force
    deterministic cuDNN kernel selection for reproducible runs."""
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)
    # Trade autotuned-kernel speed for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
106
+
107
+
108
+ # ─────────────────────────────────────────────────────────────
109
+ # Early stopping (mirrors train.py exactly)
110
+ # ─────────────────────────────────────────────────────────────
111
+
112
class EarlyStopping:
    """
    Early stopping on validation loss with best-checkpoint saving.

    Mirrors train.py: an epoch counts as an improvement when
    ``val_loss < best_loss - min_delta``; each non-improving epoch
    increments a counter and ``step`` returns True once the counter
    reaches ``patience``.
    """
    def __init__(self, patience, min_delta, checkpoint_path):
        # patience: consecutive non-improving epochs tolerated before stopping.
        # min_delta: minimum decrease in val loss that counts as improvement.
        # checkpoint_path: file the best model state_dict is written to.
        self.patience = patience
        self.min_delta = min_delta
        self.checkpoint_path = checkpoint_path
        self.best_loss = float("inf")
        self.counter = 0
        self.best_epoch = 0

    def step(self, val_loss, model, epoch) -> bool:
        """Record one epoch; checkpoint on improvement.

        Returns True when training should stop (patience exhausted).
        """
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.best_epoch = epoch
            # Persist epoch + weights + loss so restore_best can recover
            # both the state and its provenance.
            torch.save(
                {"epoch": epoch, "model_state": model.state_dict(),
                 "val_loss": val_loss},
                self.checkpoint_path,
            )
            log.info(" [ckpt] saved val_loss=%.6f epoch=%d", val_loss, epoch)
        else:
            self.counter += 1
            log.info(" [early_stop] %d/%d best=%.6f",
                     self.counter, self.patience, self.best_loss)
        return self.counter >= self.patience

    def restore_best(self, model) -> float:
        """Reload the best checkpoint into ``model``; return its val loss."""
        ckpt = torch.load(self.checkpoint_path, map_location="cpu",
                          weights_only=True)
        model.load_state_dict(ckpt["model_state"])
        log.info("Restored best weights epoch=%d val_loss=%.6f",
                 ckpt["epoch"], ckpt["val_loss"])
        return ckpt["val_loss"]
145
+
146
+
147
+ # ─────────────────────────────────────────────────────────────
148
+ # Baseline Model 1: ResNet-18 + independent MSE
149
+ # ─────────────────────────────────────────────────────────────
150
+
151
class ResNet18Baseline(nn.Module):
    """
    ResNet-18 pretrained on ImageNet with a dropout + linear head.
    Used for both the sigmoid-MSE baseline and the KL+MSE baseline.
    """

    def __init__(self, dropout: float = 0.3):
        super().__init__()
        # num_classes=0 strips timm's classifier so the backbone emits
        # pooled features only.
        self.backbone = timm.create_model(
            "resnet18", pretrained=True, num_classes=0
        )
        feat_dim = self.backbone.num_features
        # Regularised linear head mapping features -> 37 GZ2 answer logits.
        self.head = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(feat_dim, 37),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw 37-dim logits for a batch of images."""
        features = self.backbone(x)
        return self.head(features)
168
+
169
+
170
class IndependentMSELoss(nn.Module):
    """
    Plain MSE over all 37 targets independently.
    No hierarchical weighting, no KL divergence.
    Sigmoid applied to predictions before MSE to constrain range [0,1].

    Note: predictions do NOT sum to 1 per question group by construction.
    This is documented and the simplex_violation_rate metric quantifies
    this invalidity to allow fair comparison with the proposed method.
    """
    def forward(self, predictions, targets, weights):
        """Compute the loss.

        Parameters
        ----------
        predictions : raw logits, same shape as ``targets``.
        targets : ground-truth vote fractions in [0, 1].
        weights : accepted for interface parity with the hierarchical
            losses but intentionally unused here.

        Returns
        -------
        (loss, log_dict) : scalar loss tensor and a dict of floats
            for logging, matching the other loss classes' contract.
        """
        pred_prob = torch.sigmoid(predictions)
        loss = F.mse_loss(pred_prob, targets)
        # .item() already returns a detached Python float; the previous
        # .detach().item() was redundant.
        return loss, {"loss/total": loss.item()}
184
+
185
+
186
+ # ─────────────────────────────────────────────────────────────
187
+ # Shared training loop
188
+ # ─────────────────────────────────────────────────────────────
189
+
190
def _train_epoch(model, loader, loss_fn, optimizer, scaler,
                 device, cfg, epoch, label):
    """Run one mixed-precision training epoch; return the mean batch loss.

    ``loader`` yields (images, targets, weights, _); ``loss_fn`` takes
    (logits, targets, weights) and returns (loss, log_dict).
    """
    model.train()
    total = 0.0
    nb = 0
    for images, targets, weights, _ in tqdm(
        loader, desc=f"{label} E{epoch}", leave=False
    ):
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        weights = weights.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        with autocast("cuda", enabled=cfg.training.mixed_precision):
            logits = model(images)
            loss, _ = loss_fn(logits, targets, weights)
        # AMP ordering matters: scale->backward, then unscale_ so that
        # clip_grad_norm_ sees true-magnitude gradients, then step/update.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.training.grad_clip)
        scaler.step(optimizer)
        scaler.update()

        total += loss.item()
        nb += 1
    return total / nb
215
+
216
+
217
def _train_epoch_dirichlet(model, loader, loss_fn, optimizer, scaler,
                           device, cfg, epoch, label):
    """Training epoch for Dirichlet model (outputs alpha, not logits).

    Identical structure to ``_train_epoch``; only the model output
    semantics differ (Dirichlet concentration parameters fed to a
    Dirichlet NLL loss). Returns the mean batch loss.
    """
    model.train()
    total = 0.0
    nb = 0
    for images, targets, weights, _ in tqdm(
        loader, desc=f"{label} E{epoch}", leave=False
    ):
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        weights = weights.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        with autocast("cuda", enabled=cfg.training.mixed_precision):
            alpha = model(images)
            loss, _ = loss_fn(alpha, targets, weights)
        # Unscale before clipping so the gradient norm is measured in
        # true (unscaled) units.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.training.grad_clip)
        scaler.step(optimizer)
        scaler.update()

        total += loss.item()
        nb += 1
    return total / nb
243
+
244
+
245
def _val_epoch(model, loader, loss_fn, device, cfg, epoch, label,
               use_sigmoid=False):
    """Run one validation epoch; return (mean loss, metrics dict).

    Predictions are converted to probabilities before metrics:
    - use_sigmoid=True: element-wise sigmoid (B1 baseline, no simplex
      guarantee);
    - otherwise: softmax applied independently within each question's
      column slice from QUESTION_GROUPS, so each group sums to 1.
    Metrics come from ``compute_metrics`` on the concatenated arrays.
    """
    model.eval()
    total = 0.0
    nb = 0
    all_preds, all_targets, all_weights = [], [], []

    with torch.no_grad():
        for images, targets, weights, _ in tqdm(
            loader, desc=f"{label} Val E{epoch}", leave=False
        ):
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            weights = weights.to(device, non_blocking=True)

            with autocast("cuda", enabled=cfg.training.mixed_precision):
                logits = model(images)
                loss, _ = loss_fn(logits, targets, weights)

            total += loss.item()
            nb += 1

            if use_sigmoid:
                pred_prob = torch.sigmoid(logits).detach().cpu().numpy()
            else:
                # Clone before the in-place per-group softmax writes.
                pred_cpu = logits.detach().cpu().clone()
                for q, (s, e) in QUESTION_GROUPS.items():
                    pred_cpu[:, s:e] = torch.softmax(pred_cpu[:, s:e], dim=-1)
                pred_prob = pred_cpu.numpy()

            all_preds.append(pred_prob)
            all_targets.append(targets.detach().cpu().numpy())
            all_weights.append(weights.detach().cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    all_weights = np.concatenate(all_weights)
    metrics = compute_metrics(all_preds, all_targets, all_weights)

    return total / nb, metrics
285
+
286
+
287
def _val_epoch_dirichlet(model, loader, loss_fn, device, cfg, epoch, label):
    """Validation epoch for the Dirichlet model; return (mean loss, metrics).

    Conversion from alpha concentrations to probability predictions is
    delegated to ``dirichlet_predictions_to_numpy``; metrics then come
    from the same ``compute_metrics`` used by the other baselines.
    """
    model.eval()
    total = 0.0
    nb = 0
    all_preds, all_targets, all_weights = [], [], []

    with torch.no_grad():
        for images, targets, weights, _ in tqdm(
            loader, desc=f"{label} Val E{epoch}", leave=False
        ):
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            weights = weights.to(device, non_blocking=True)

            with autocast("cuda", enabled=cfg.training.mixed_precision):
                alpha = model(images)
                loss, _ = loss_fn(alpha, targets, weights)

            total += loss.item()
            nb += 1

            p, t, w = dirichlet_predictions_to_numpy(alpha, targets, weights)
            all_preds.append(p)
            all_targets.append(t)
            all_weights.append(w)

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    all_weights = np.concatenate(all_weights)
    metrics = compute_metrics(all_preds, all_targets, all_weights)

    return total / nb, metrics
319
+
320
+
321
+ # ─────────────────────────────────────────────────────────────
322
+ # Generic train_and_evaluate (non-Dirichlet)
323
+ # ─────────────────────────────────────────────────────────────
324
+
325
def train_and_evaluate(
    model, loss_fn, cfg, device,
    label, checkpoint_path,
    use_layerwise_lr=True,
    use_sigmoid=False,
):
    """
    Full training loop consistent with train.py.

    Parameters
    ----------
    model, loss_fn : backbone and loss (the only things that vary
        between baselines).
    cfg : merged OmegaConf config (training/scheduler/early_stopping/
        wandb sections are read).
    label : run name used for logging, tqdm and wandb.
    checkpoint_path : best-checkpoint file; if it already exists the
        loop is skipped entirely and the checkpoint is evaluated.
    use_layerwise_lr : give the backbone 10x lower lr than the head
        (only when the model exposes .backbone and .head).
    use_sigmoid : forwarded to _val_epoch's prediction conversion.

    Returns (test_metrics, best_val_loss, best_epoch, history); history
    is [] when training was skipped via an existing checkpoint.
    """
    # Check if checkpoint exists - if so, skip training
    if Path(checkpoint_path).exists():
        log.info("%s: checkpoint found - loading and skipping training", label)
        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
        model.load_state_dict(ckpt["model_state"])
        best_epoch = ckpt.get("epoch", 0)
        best_val = ckpt.get("val_loss", float("inf"))
        log.info("Restored: epoch=%d, val_loss=%.6f", best_epoch, best_val)

        # Evaluate on test set
        _, _, test_loader = build_dataloaders(cfg)
        _, test_metrics = _val_epoch(
            model, test_loader, loss_fn, device, cfg,
            epoch=0, label=f"{label}-test", use_sigmoid=use_sigmoid
        )
        return test_metrics, best_val, best_epoch, []

    train_loader, val_loader, test_loader = build_dataloaders(cfg)

    # Optimiser: optional 10x-lower backbone lr for pretrained backbones.
    if use_layerwise_lr and hasattr(model, "backbone") and hasattr(model, "head"):
        optimizer = torch.optim.AdamW(
            [
                {"params": model.backbone.parameters(),
                 "lr": cfg.training.learning_rate * 0.1},
                {"params": model.head.parameters(),
                 "lr": cfg.training.learning_rate},
            ],
            weight_decay=cfg.training.weight_decay,
        )
        log.info("%s: layer-wise lr — backbone=%.1e head=%.1e",
                 label, cfg.training.learning_rate * 0.1, cfg.training.learning_rate)
    else:
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=cfg.training.learning_rate,
            weight_decay=cfg.training.weight_decay,
        )
        log.info("%s: single lr=%.1e", label, cfg.training.learning_rate)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.scheduler.T_max, eta_min=cfg.scheduler.eta_min
    )
    scaler = GradScaler("cuda")
    early_stop = EarlyStopping(
        patience=cfg.early_stopping.patience,
        min_delta=cfg.early_stopping.min_delta,
        checkpoint_path=checkpoint_path,
    )

    wandb.init(
        project=cfg.wandb.project,
        name=label,
        config={
            "model": label, "backbone": "resnet18" if "ResNet" in label else "vit_base_patch16_224",
            "batch_size": cfg.training.batch_size, "lr": cfg.training.learning_rate,
            "epochs": cfg.training.epochs, "seed": cfg.seed,
            "lambda_kl": cfg.loss.lambda_kl, "lambda_mse": cfg.loss.lambda_mse,
        },
        reinit=True,
    )

    history = []
    for epoch in range(1, cfg.training.epochs + 1):
        train_loss = _train_epoch(
            model, train_loader, loss_fn, optimizer, scaler, device, cfg, epoch, label
        )
        val_loss, val_metrics = _val_epoch(
            model, val_loader, loss_fn, device, cfg, epoch, label,
            use_sigmoid=use_sigmoid
        )
        scheduler.step()
        lr = scheduler.get_last_lr()[0]

        val_mae = val_metrics.get("mae/weighted_avg", 0)
        log.info("%s epoch=%d train=%.4f val=%.4f mae=%.4f lr=%.2e",
                 label, epoch, train_loss, val_loss, val_mae, lr)
        history.append({
            "epoch": epoch, "train_loss": train_loss,
            "val_loss": val_loss, "val_mae": val_mae,
        })
        wandb.log({
            "train_loss": train_loss, "val_loss": val_loss,
            "val_mae": val_mae, "lr": lr,
        }, step=epoch)

        # EarlyStopping also checkpoints on improvement; True means stop.
        if early_stop.step(val_loss, model, epoch):
            log.info("%s: early stopping at epoch %d best=%d",
                     label, epoch, early_stop.best_epoch)
            break

    best_val = early_stop.restore_best(model)
    wandb.finish()

    log.info("%s: evaluating on test set...", label)
    _, test_metrics = _val_epoch(
        model, test_loader, loss_fn, device, cfg,
        epoch=0, label=f"{label}-test", use_sigmoid=use_sigmoid
    )
    return test_metrics, best_val, early_stop.best_epoch, history
435
+
436
+
437
+ # ─────────────────────────────────────────────────────────────
438
+ # Dirichlet train_and_evaluate
439
+ # ─────────────────────────────────────────────────────────────
440
+
441
def train_and_evaluate_dirichlet(model, loss_fn, cfg, device,
                                 label, checkpoint_path):
    """Training loop for Dirichlet model. Skips training if checkpoint exists.

    Same protocol as ``train_and_evaluate`` (AdamW with layer-wise lr,
    cosine schedule, AMP, early stopping, wandb logging) but routed
    through the Dirichlet-specific epoch helpers, since the model emits
    alpha concentrations rather than logits.
    Returns (test_metrics, best_val_loss, best_epoch, history).
    """
    # Check if checkpoint exists - if so, skip training
    if Path(checkpoint_path).exists():
        log.info("%s: checkpoint found - loading and skipping training", label)
        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
        model.load_state_dict(ckpt["model_state"])
        best_epoch = ckpt.get("epoch", 0)
        best_val = ckpt.get("val_loss", float("inf"))
        log.info("Restored: epoch=%d, val_loss=%.6f", best_epoch, best_val)

        # Evaluate on test set
        _, _, test_loader = build_dataloaders(cfg)
        _, test_metrics = _val_epoch_dirichlet(
            model, test_loader, loss_fn, device, cfg,
            epoch=0, label=f"{label}-test"
        )
        return test_metrics, best_val, best_epoch, []

    train_loader, val_loader, test_loader = build_dataloaders(cfg)

    # Backbone gets 10x lower lr than the head (pretrained ViT).
    optimizer = torch.optim.AdamW(
        [
            {"params": model.backbone.parameters(),
             "lr": cfg.training.learning_rate * 0.1},
            {"params": model.head.parameters(),
             "lr": cfg.training.learning_rate},
        ],
        weight_decay=cfg.training.weight_decay,
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.scheduler.T_max, eta_min=cfg.scheduler.eta_min
    )
    scaler = GradScaler("cuda")
    early_stop = EarlyStopping(
        patience=cfg.early_stopping.patience,
        min_delta=cfg.early_stopping.min_delta,
        checkpoint_path=checkpoint_path,
    )

    wandb.init(
        project=cfg.wandb.project, name=label,
        config={"model": label, "loss": "DirichletNLL",
                "seed": cfg.seed, "epochs": cfg.training.epochs},
        reinit=True,
    )

    history = []
    for epoch in range(1, cfg.training.epochs + 1):
        train_loss = _train_epoch_dirichlet(
            model, train_loader, loss_fn, optimizer, scaler, device, cfg, epoch, label
        )
        val_loss, val_metrics = _val_epoch_dirichlet(
            model, val_loader, loss_fn, device, cfg, epoch, label
        )
        scheduler.step()
        lr = scheduler.get_last_lr()[0]

        val_mae = val_metrics.get("mae/weighted_avg", 0)
        log.info("%s epoch=%d train=%.4f val=%.4f mae=%.4f lr=%.2e",
                 label, epoch, train_loss, val_loss, val_mae, lr)
        history.append({
            "epoch": epoch, "train_loss": train_loss,
            "val_loss": val_loss, "val_mae": val_mae,
        })
        wandb.log({
            "train_loss": train_loss, "val_loss": val_loss,
            "val_mae": val_mae, "lr": lr,
        }, step=epoch)

        if early_stop.step(val_loss, model, epoch):
            log.info("%s: early stopping at epoch %d", label, epoch)
            break

    best_val = early_stop.restore_best(model)
    wandb.finish()

    log.info("%s: evaluating on test set...", label)
    _, test_metrics = _val_epoch_dirichlet(
        model, test_loader, loss_fn, device, cfg, epoch=0, label=f"{label}-test"
    )
    return test_metrics, best_val, early_stop.best_epoch, history
524
+
525
+
526
+ # ─────────────────────────────────────────────────────────────
527
+ # Figures
528
+ # ─────────────────────────────────────────────────────────────
529
+
530
def _save_comparison_figures(all_results, all_histories, save_dir):
    """
    Saves:
        1. Per-question MAE + RMSE bar chart
        2. Validation MAE learning curves
    (The simplex-violation numbers are written to the CSV tables in
    main(), not plotted here.)
    All figure names follow IEEE journal conventions.

    Parameters
    ----------
    all_results : list of per-model result dicts containing "model" and
        "mae_<q>"/"rmse_<q>" keys for each question in QUESTION_GROUPS.
    all_histories : {model name: [{"epoch", "val_mae", ...}, ...]}.
    save_dir : Path; figures are written there as both .pdf and .png.
    """
    q_names = list(QUESTION_GROUPS.keys())
    n_models = len(all_results)
    x = np.arange(len(q_names))
    # Total group width 0.80, split evenly across the models.
    width = 0.80 / n_models
    palette = ["#c0392b", "#e67e22", "#2980b9", "#27ae60", "#8e44ad"]

    # ── Figure 1: Per-question MAE and RMSE ───────────────────
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    for metric, ax, ylabel in [
        ("mae", axes[0], "Mean Absolute Error (MAE)"),
        ("rmse", axes[1], "Root Mean Squared Error (RMSE)"),
    ]:
        for i, (row_d, color) in enumerate(zip(all_results, palette)):
            # Missing per-question entries plot as NaN (empty bar).
            vals = [row_d.get(f"{metric}_{q}", np.nan) for q in q_names]
            ax.bar(x + i * width, vals, width,
                   label=row_d["model"], color=color,
                   alpha=0.85, edgecolor="white", linewidth=0.5)
        # Centre tick labels under each group of bars.
        ax.set_xticks(x + width * (n_models - 1) / 2)
        ax.set_xticklabels(
            [f"{q}\n({QUESTION_LABELS[q][:10]})" for q in q_names],
            rotation=45, ha="right", fontsize=7,
        )
        ax.set_ylabel(ylabel, fontsize=11)
        ax.set_title(f"Per-question {metric.upper()} — baseline comparison", fontsize=11)
        ax.legend(fontsize=7, loc="upper right")
        ax.grid(True, alpha=0.3, axis="y")
        ax.set_axisbelow(True)

    plt.suptitle(
        "Baseline comparison — GZ2 hierarchical probabilistic regression\n"
        "Full 239,267-sample dataset, identical seed/split/protocol",
        fontsize=12, y=1.02,
    )
    plt.tight_layout()
    fig.savefig(save_dir / "fig_baseline_comparison_mae_rmse.pdf",
                dpi=300, bbox_inches="tight")
    fig.savefig(save_dir / "fig_baseline_comparison_mae_rmse.png",
                dpi=300, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: fig_baseline_comparison_mae_rmse")

    # ── Figure 2: Validation MAE learning curves ───────────────
    fig2, ax2 = plt.subplots(figsize=(10, 5))
    styles = ["-", "--", "-.", ":", (0, (3, 1, 1, 1))]
    markers = ["o", "s", "^", "D", "v"]
    for (name, hist), ls, color, mk in zip(
        all_histories.items(), styles, palette, markers
    ):
        epochs_h = [h["epoch"] for h in hist]
        val_maes = [h["val_mae"] for h in hist]
        ax2.plot(epochs_h, val_maes, linestyle=ls, color=color, linewidth=1.8,
                 label=name, marker=mk, markersize=3, markevery=5)

    ax2.set_xlabel("Epoch", fontsize=11)
    ax2.set_ylabel("Validation MAE (weighted average)", fontsize=11)
    ax2.set_title("Validation MAE during training — all baseline models", fontsize=11)
    ax2.legend(fontsize=9)
    ax2.grid(True, alpha=0.3)
    plt.tight_layout()
    fig2.savefig(save_dir / "fig_baseline_val_mae_curves.pdf",
                 dpi=300, bbox_inches="tight")
    fig2.savefig(save_dir / "fig_baseline_val_mae_curves.png",
                 dpi=300, bbox_inches="tight")
    plt.close(fig2)
    log.info("Saved: fig_baseline_val_mae_curves")
603
+
604
+
605
+ # ─────────────────────────────────────────────────────────────
606
+ # Main
607
+ # ─────────────────────────────────────────────────────────────
608
+
609
def main():
    """Train/evaluate baselines B1-B4, then (if its checkpoint exists)
    evaluate the proposed model, and write comparison tables + figures.

    CLI: --config  experiment YAML merged over configs/base.yaml.
    Each baseline re-seeds with cfg.seed so all runs see identical
    splits and initial RNG state; train_and_evaluate* skip training
    when their checkpoint already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True)
    args = parser.parse_args()

    base_cfg = OmegaConf.load("configs/base.yaml")
    exp_cfg = OmegaConf.load(args.config)
    cfg = OmegaConf.merge(base_cfg, exp_cfg)

    set_seed(cfg.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info("Device: %s Dataset: %s",
             device, "full 239k" if cfg.data.n_samples is None
             else f"{cfg.data.n_samples:,}")

    # TF32 speeds up matmul/conv on Ampere+ GPUs with negligible accuracy cost.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    save_dir = Path(cfg.outputs.figures_dir) / "comparison"
    ckpt_dir = Path(cfg.outputs.checkpoint_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    all_results = []
    all_histories = {}

    # ─── B1: ResNet-18 + independent MSE (sigmoid) ────────────
    log.info("=" * 60)
    log.info("B1: ResNet-18 + independent MSE (sigmoid, no hierarchy)")
    log.info("=" * 60)
    set_seed(cfg.seed)

    rn_mse_model = ResNet18Baseline(dropout=cfg.model.dropout).to(device)
    rn_mse_loss = IndependentMSELoss()
    log.info("ResNet-18 params: %s", f"{sum(p.numel() for p in rn_mse_model.parameters()):,}")

    rn_mse_metrics, rn_mse_val, rn_mse_epoch, rn_mse_hist = train_and_evaluate(
        rn_mse_model, rn_mse_loss, cfg, device,
        label = "B1-ResNet18-MSE",
        checkpoint_path = str(ckpt_dir / "baseline_resnet18_mse.pt"),
        use_layerwise_lr = False,
        use_sigmoid = True,
    )

    # Simplex violation for this baseline
    # (only B1 can violate the per-question simplex: sigmoid outputs are
    # independent, unlike the softmax/Dirichlet heads of B2-B4).
    _, _, test_loader_tmp = build_dataloaders(cfg)
    rn_mse_model.eval()
    tmp_preds = []
    with torch.no_grad():
        for images, _, _, _ in test_loader_tmp:
            images = images.to(device, non_blocking=True)
            logits = rn_mse_model(images)
            tmp_preds.append(torch.sigmoid(logits).cpu().numpy())
    tmp_preds = np.concatenate(tmp_preds)
    svr = simplex_violation_rate(tmp_preds, tolerance=0.02)
    log.info("B1 simplex violation rate (mean): %.4f", svr["mean"])

    row = {
        "model": "ResNet-18 + MSE (sigmoid, no hierarchy)",
        "backbone": "ResNet-18", "loss": "Independent MSE",
        "hierarchy": "None",
        "best_epoch": rn_mse_epoch, "best_val_loss": round(rn_mse_val, 5),
        "mae_weighted" : round(rn_mse_metrics["mae/weighted_avg"], 5),
        "rmse_weighted": round(rn_mse_metrics["rmse/weighted_avg"], 5),
        "simplex_violation_mean": round(svr["mean"], 4),
    }
    for q in QUESTION_GROUPS:
        row[f"mae_{q}"] = round(rn_mse_metrics[f"mae/{q}"], 5)
        row[f"rmse_{q}"] = round(rn_mse_metrics[f"rmse/{q}"], 5)
    all_results.append(row)
    all_histories["ResNet-18 + MSE (sigmoid)"] = rn_mse_hist
    log.info("B1 done: MAE=%.5f RMSE=%.5f SimplexViol=%.4f",
             rn_mse_metrics["mae/weighted_avg"],
             rn_mse_metrics["rmse/weighted_avg"],
             svr["mean"])

    # ─── B2: ResNet-18 + hierarchical KL+MSE ──────────────────
    log.info("=" * 60)
    log.info("B2: ResNet-18 + hierarchical KL+MSE (same loss as proposed)")
    log.info("=" * 60)
    set_seed(cfg.seed)

    rn_kl_model = ResNet18Baseline(dropout=cfg.model.dropout).to(device)
    rn_kl_loss = HierarchicalLoss(cfg)

    rn_kl_metrics, rn_kl_val, rn_kl_epoch, rn_kl_hist = train_and_evaluate(
        rn_kl_model, rn_kl_loss, cfg, device,
        label = "B2-ResNet18-KL+MSE",
        checkpoint_path = str(ckpt_dir / "baseline_resnet18_klmse.pt"),
        use_layerwise_lr = False,
        use_sigmoid = False,
    )
    row = {
        "model": "ResNet-18 + hierarchical KL+MSE",
        "backbone": "ResNet-18", "loss": "Hierarchical KL+MSE (λ=0.5)",
        "hierarchy": "Full (weights + KL)",
        "best_epoch": rn_kl_epoch, "best_val_loss": round(rn_kl_val, 5),
        "mae_weighted" : round(rn_kl_metrics["mae/weighted_avg"], 5),
        "rmse_weighted": round(rn_kl_metrics["rmse/weighted_avg"], 5),
        "simplex_violation_mean": 0.0,  # softmax guarantees validity
    }
    for q in QUESTION_GROUPS:
        row[f"mae_{q}"] = round(rn_kl_metrics[f"mae/{q}"], 5)
        row[f"rmse_{q}"] = round(rn_kl_metrics[f"rmse/{q}"], 5)
    all_results.append(row)
    all_histories["ResNet-18 + KL+MSE"] = rn_kl_hist
    log.info("B2 done: MAE=%.5f RMSE=%.5f",
             rn_kl_metrics["mae/weighted_avg"],
             rn_kl_metrics["rmse/weighted_avg"])

    # ─── B3: ViT-Base + hierarchical MSE only ─────────────────
    log.info("=" * 60)
    log.info("B3: ViT-Base + hierarchical MSE only (no KL term)")
    log.info("=" * 60)
    set_seed(cfg.seed)

    # Ablate the KL term by overriding the loss weights in a merged copy.
    from omegaconf import OmegaConf as OC
    vit_mse_cfg = OC.merge(cfg, OC.create({"loss": {"lambda_kl": 0.0, "lambda_mse": 1.0}}))
    vit_mse_model = build_model(vit_mse_cfg).to(device)
    vit_mse_loss = MSEOnlyLoss(vit_mse_cfg)

    vit_mse_metrics, vit_mse_val, vit_mse_epoch, vit_mse_hist = train_and_evaluate(
        vit_mse_model, vit_mse_loss, vit_mse_cfg, device,
        label = "B3-ViT-MSE",
        checkpoint_path = str(ckpt_dir / "baseline_vit_mse.pt"),
        use_layerwise_lr = True,
        use_sigmoid = False,
    )
    row = {
        "model": "ViT-Base + hierarchical MSE (no KL)",
        "backbone": "ViT-Base/16", "loss": "Hierarchical MSE (λ_KL=0)",
        "hierarchy": "Weights only",
        "best_epoch": vit_mse_epoch, "best_val_loss": round(vit_mse_val, 5),
        "mae_weighted" : round(vit_mse_metrics["mae/weighted_avg"], 5),
        "rmse_weighted": round(vit_mse_metrics["rmse/weighted_avg"], 5),
        "simplex_violation_mean": 0.0,
    }
    for q in QUESTION_GROUPS:
        row[f"mae_{q}"] = round(vit_mse_metrics[f"mae/{q}"], 5)
        row[f"rmse_{q}"] = round(vit_mse_metrics[f"rmse/{q}"], 5)
    all_results.append(row)
    all_histories["ViT-Base + MSE only"] = vit_mse_hist
    log.info("B3 done: MAE=%.5f RMSE=%.5f",
             vit_mse_metrics["mae/weighted_avg"],
             vit_mse_metrics["rmse/weighted_avg"])

    # ─── B4: ViT-Base + Dirichlet NLL (Zoobot-style) ──────────
    log.info("=" * 60)
    log.info("B4: ViT-Base + Dirichlet NLL (Walmsley et al. 2022)")
    log.info("=" * 60)
    set_seed(cfg.seed)

    vit_dir_model = build_dirichlet_model(cfg).to(device)
    vit_dir_loss = DirichletLoss(cfg)

    vit_dir_metrics, vit_dir_val, vit_dir_epoch, vit_dir_hist = train_and_evaluate_dirichlet(
        vit_dir_model, vit_dir_loss, cfg, device,
        label = "B4-ViT-Dirichlet",
        checkpoint_path = str(ckpt_dir / "baseline_vit_dirichlet.pt"),
    )
    row = {
        "model": "ViT-Base + Dirichlet NLL (Zoobot-style)",
        "backbone": "ViT-Base/16", "loss": "Dirichlet NLL",
        "hierarchy": "Full (weights + Dirichlet)",
        "best_epoch": vit_dir_epoch, "best_val_loss": round(vit_dir_val, 5),
        "mae_weighted" : round(vit_dir_metrics["mae/weighted_avg"], 5),
        "rmse_weighted": round(vit_dir_metrics["rmse/weighted_avg"], 5),
        "simplex_violation_mean": 0.0,
    }
    for q in QUESTION_GROUPS:
        row[f"mae_{q}"] = round(vit_dir_metrics[f"mae/{q}"], 5)
        row[f"rmse_{q}"] = round(vit_dir_metrics[f"rmse/{q}"], 5)
    all_results.append(row)
    all_histories["ViT-Base + Dirichlet"] = vit_dir_hist
    log.info("B4 done: MAE=%.5f RMSE=%.5f",
             vit_dir_metrics["mae/weighted_avg"],
             vit_dir_metrics["rmse/weighted_avg"])

    # ─── Proposed: load existing checkpoint for final table ────
    # The proposed model is trained by src/train.py; here it is only
    # evaluated on the same test split for the comparison table.
    proposed_ckpt = ckpt_dir / "best_full_train.pt"
    if proposed_ckpt.exists():
        log.info("=" * 60)
        log.info("PROPOSED: Loading ViT-Base + hierarchical KL+MSE")
        log.info("=" * 60)
        proposed_model = build_model(cfg).to(device)
        proposed_model.load_state_dict(
            torch.load(proposed_ckpt, map_location="cpu", weights_only=True)["model_state"]
        )
        _, _, test_loader_p = build_dataloaders(cfg)
        _, proposed_metrics = _val_epoch(
            proposed_model, test_loader_p, HierarchicalLoss(cfg), device, cfg,
            epoch=0, label="Proposed-test", use_sigmoid=False
        )
        ckpt_info = torch.load(proposed_ckpt, map_location="cpu", weights_only=True)
        row = {
            "model": "ViT-Base + hierarchical KL+MSE (proposed)",
            "backbone": "ViT-Base/16", "loss": "Hierarchical KL+MSE (λ=0.5)",
            "hierarchy": "Full (weights + KL)",
            "best_epoch": ckpt_info["epoch"],
            "best_val_loss": round(ckpt_info["val_loss"], 5),
            "mae_weighted" : round(proposed_metrics["mae/weighted_avg"], 5),
            "rmse_weighted": round(proposed_metrics["rmse/weighted_avg"], 5),
            "simplex_violation_mean": 0.0,
        }
        for q in QUESTION_GROUPS:
            row[f"mae_{q}"] = round(proposed_metrics[f"mae/{q}"], 5)
            row[f"rmse_{q}"] = round(proposed_metrics[f"rmse/{q}"], 5)
        all_results.append(row)
        log.info("Proposed: MAE=%.5f RMSE=%.5f",
                 proposed_metrics["mae/weighted_avg"],
                 proposed_metrics["rmse/weighted_avg"])

    # ─── Save results ──────────────────────────────────────────
    df = pd.DataFrame(all_results)
    df.to_csv(save_dir / "table_baseline_comparison.csv", index=False)

    summary_cols = ["model", "loss", "hierarchy", "best_epoch",
                    "best_val_loss", "mae_weighted", "rmse_weighted",
                    "simplex_violation_mean"]
    summary = df[[c for c in summary_cols if c in df.columns]].copy()
    summary.to_csv(save_dir / "table_baseline_summary.csv", index=False)

    print()
    print("=" * 80)
    print("BASELINE COMPARISON — FINAL RESULTS")
    print("=" * 80)
    print(summary.to_string(index=False))
    print()

    # ─── Figures ───────────────────────────────────────────────
    _save_comparison_figures(all_results, all_histories, save_dir)

    log.info("All baseline outputs saved to: %s", save_dir)
841
+
842
+
843
+ if __name__ == "__main__":
844
+ main()
src/dataset.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/dataset.py
3
+ --------------
4
+ Galaxy Zoo 2 dataset loader for hierarchical probabilistic regression.
5
+
6
+ The GZ2 decision tree has 11 questions (t01-t11) with 37 total answer
7
+ columns. Each question is a conditional probability vector β€” not
8
+ independent regression targets.
9
+
10
+ Hierarchy (parent answer -> child question):
11
+ t01_a02 (features/disk) -> t02, t03, t04, t05, t06
12
+ t02_a05 (not edge-on) -> t03, t04
13
+ t04_a08 (has spiral) -> t10, t11
14
+ t06_a14 (odd feature) -> t08
15
+ t01_a01 (smooth) -> t07
16
+ t02_a04 (edge-on) -> t09
17
+
18
+ References
19
+ ----------
20
+ Willett et al. (2013), MNRAS 435, 2835
21
+ Hart et al. (2016), MNRAS 461, 3663
22
+ """
23
+
24
+ import math
25
+ import logging
26
+ from pathlib import Path
27
+
28
+ import numpy as np
29
+ import pandas as pd
30
+ import torch
31
+ from torch.utils.data import Dataset, DataLoader
32
+ from torchvision import transforms
33
+ from PIL import Image
34
+ from omegaconf import DictConfig
35
+
36
+ log = logging.getLogger(__name__)
37
+
38
# ─────────────────────────────────────────────────────────────
# GZ2 decision tree definition
# ─────────────────────────────────────────────────────────────

# The 37 debiased vote-fraction columns, grouped by question t01–t11.
# ORDER MATTERS: QUESTION_GROUPS below slices into this list by index,
# so entries must stay grouped per question and in this exact order.
LABEL_COLUMNS = [
    # t01: smooth or features?
    "t01_smooth_or_features_a01_smooth_debiased",
    "t01_smooth_or_features_a02_features_or_disk_debiased",
    "t01_smooth_or_features_a03_star_or_artifact_debiased",
    # t02: edge-on?
    "t02_edgeon_a04_yes_debiased",
    "t02_edgeon_a05_no_debiased",
    # t03: bar?
    "t03_bar_a06_bar_debiased",
    "t03_bar_a07_no_bar_debiased",
    # t04: spiral?
    "t04_spiral_a08_spiral_debiased",
    "t04_spiral_a09_no_spiral_debiased",
    # t05: bulge prominence
    "t05_bulge_prominence_a10_no_bulge_debiased",
    "t05_bulge_prominence_a11_just_noticeable_debiased",
    "t05_bulge_prominence_a12_obvious_debiased",
    "t05_bulge_prominence_a13_dominant_debiased",
    # t06: odd feature?
    "t06_odd_a14_yes_debiased",
    "t06_odd_a15_no_debiased",
    # t07: roundedness (smooth galaxies)
    "t07_rounded_a16_completely_round_debiased",
    "t07_rounded_a17_in_between_debiased",
    "t07_rounded_a18_cigar_shaped_debiased",
    # t08: odd feature type
    "t08_odd_feature_a19_ring_debiased",
    "t08_odd_feature_a20_lens_or_arc_debiased",
    "t08_odd_feature_a21_disturbed_debiased",
    "t08_odd_feature_a22_irregular_debiased",
    "t08_odd_feature_a23_other_debiased",
    "t08_odd_feature_a24_merger_debiased",
    "t08_odd_feature_a38_dust_lane_debiased",
    # t09: bulge shape (edge-on only)
    "t09_bulge_shape_a25_rounded_debiased",
    "t09_bulge_shape_a26_boxy_debiased",
    "t09_bulge_shape_a27_no_bulge_debiased",
    # t10: arms winding
    "t10_arms_winding_a28_tight_debiased",
    "t10_arms_winding_a29_medium_debiased",
    "t10_arms_winding_a30_loose_debiased",
    # t11: arms number
    "t11_arms_number_a31_1_debiased",
    "t11_arms_number_a32_2_debiased",
    "t11_arms_number_a33_3_debiased",
    "t11_arms_number_a34_4_debiased",
    "t11_arms_number_a36_more_than_4_debiased",
    "t11_arms_number_a37_cant_tell_debiased",
]

# Slice indices into LABEL_COLUMNS for each question group.
# (start, end) half-open ranges; insertion order also defines the column
# order of the [11]-wide per-question weight matrix built by the dataset.
QUESTION_GROUPS = {
    "t01": (0, 3),
    "t02": (3, 5),
    "t03": (5, 7),
    "t04": (7, 9),
    "t05": (9, 13),
    "t06": (13, 15),
    "t07": (15, 18),
    "t08": (18, 25),
    "t09": (25, 28),
    "t10": (28, 31),
    "t11": (31, 37),
}

# Parent answer column for hierarchical branch weighting.
# w_q = vote fraction of the parent answer that unlocks question q.
# t01 is the root question; its weight is always 1.0 (parent is None).
QUESTION_PARENT_COL = {
    "t01": None,
    "t02": "t01_smooth_or_features_a02_features_or_disk_debiased",
    "t03": "t02_edgeon_a05_no_debiased",
    "t04": "t02_edgeon_a05_no_debiased",
    "t05": "t01_smooth_or_features_a02_features_or_disk_debiased",
    "t06": "t01_smooth_or_features_a02_features_or_disk_debiased",
    "t07": "t01_smooth_or_features_a01_smooth_debiased",
    "t08": "t06_odd_a14_yes_debiased",
    "t09": "t02_edgeon_a04_yes_debiased",
    "t10": "t04_spiral_a08_spiral_debiased",
    "t11": "t04_spiral_a08_spiral_debiased",
}

# Total number of answer columns across all 11 questions.
N_LABELS = len(LABEL_COLUMNS)  # 37
126
+
127
+
128
+ # ─────────────────────────────────────────────────────────────
129
+ # Image transforms
130
+ # ─────────────────────────────────────────────────────────────
131
+
132
def get_transforms(image_size: int, split: str) -> transforms.Compose:
    """Return the torchvision preprocessing pipeline for one split.

    Training adds orientation-invariant augmentation (galaxies have no
    preferred orientation) plus mild colour jitter for instrument
    variation; every split ends with ImageNet normalisation.

    Parameters
    ----------
    split : "train" for the augmented pipeline; anything else gets the
            deterministic resize-only pipeline.
    """
    # ImageNet statistics; Normalize is stateless so one instance is shared.
    normalise = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    if split != "train":
        # Deterministic val / test pipeline.
        return transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            normalise,
        ])

    # Slightly over-resize so RandomCrop can translate the galaxy a little.
    return transforms.Compose([
        transforms.Resize((image_size + 16, image_size + 16)),
        transforms.RandomCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(180),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.1),
        transforms.ToTensor(),
        normalise,
    ])
158
+
159
+
160
+ # ─────────────────────────────────────────────────────────────
161
+ # Dataset
162
+ # ─────────────────────────────────────────────────────────────
163
+
164
class GalaxyZoo2Dataset(Dataset):
    """PyTorch Dataset for Galaxy Zoo 2 vote-fraction regression.

    Each item is a 4-tuple:
        image    : FloatTensor [3, H, W] — normalised galaxy image
        targets  : FloatTensor [37]      — vote-fraction vector
        weights  : FloatTensor [11]      — per-question hierarchical weights
        image_id : int                   — dr7objid for traceability
    """

    def __init__(self, df: pd.DataFrame, image_dir: str, transform):
        self.df = df.reset_index(drop=True)
        self.image_dir = Path(image_dir)
        self.transform = transform

        # Pre-extract labels and weights once so __getitem__ is cheap.
        self.labels = self.df[LABEL_COLUMNS].values.astype(np.float32)
        self.weights = self._compute_weights()
        self.image_ids = self.df["dr7objid"].tolist()

    def _compute_weights(self) -> np.ndarray:
        """Return [n, 11] weights: parent-answer vote fraction per question."""
        question_names = list(QUESTION_GROUPS)
        w = np.ones((len(self.df), len(question_names)), dtype=np.float32)
        for col, q_name in enumerate(question_names):
            parent = QUESTION_PARENT_COL[q_name]
            if parent is None:
                continue  # root question (t01) keeps weight 1.0
            w[:, col] = self.df[parent].values.astype(np.float32)
        return w

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int):
        image_id = self.image_ids[idx]
        path = self.image_dir / f"{image_id}.jpg"
        try:
            img = Image.open(path).convert("RGB")
        except FileNotFoundError:
            # Re-raise with a message that names the offending object id.
            raise FileNotFoundError(
                f"Image not found: {path}. "
                f"Check dr7objid {image_id} has a matching .jpg file."
            )
        image = self.transform(img)
        return (image,
                torch.from_numpy(self.labels[idx]),
                torch.from_numpy(self.weights[idx]),
                image_id)
212
+
213
+
214
+ # ─────────────────────────────────────────────────────────────
215
+ # DataLoader factory
216
+ # ─────────────────────────────────────────────────────────────
217
+
218
def build_dataloaders(cfg: DictConfig):
    """Build train / val / test DataLoaders from the labels parquet.

    Parameters
    ----------
    cfg : DictConfig
        Needs data.{parquet_path, image_dir, image_size, n_samples,
        train_frac, val_frac, num_workers, pin_memory},
        training.batch_size and seed. data.persistent_workers /
        data.prefetch_factor are optional tuning knobs.

    Returns
    -------
    (train_loader, val_loader, test_loader)

    Raises
    ------
    ValueError
        If any expected GZ2 answer column is missing from the parquet.
    """
    log.info("Loading parquet: %s", cfg.data.parquet_path)
    df = pd.read_parquet(cfg.data.parquet_path)

    # Fail fast if the parquet lacks any of the 37 answer columns.
    missing = [c for c in LABEL_COLUMNS if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in parquet: {missing}")

    if cfg.data.n_samples is not None:
        # Clamp to the dataset size: df.sample(n > len(df)) raises ValueError.
        n = min(int(cfg.data.n_samples), len(df))
        log.info("Using subset of %d samples (full dataset: %d)", n, len(df))
        df = df.sample(n=n, random_state=cfg.seed).reset_index(drop=True)
    else:
        log.info("Using full dataset: %d samples", len(df))

    # Deterministic (seeded) shuffled split into train / val / test.
    rng = np.random.default_rng(cfg.seed)
    idx = rng.permutation(len(df))
    n = len(df)
    n_train = math.floor(cfg.data.train_frac * n)
    n_val = math.floor(cfg.data.val_frac * n)

    train_idx = idx[:n_train]
    val_idx = idx[n_train : n_train + n_val]
    test_idx = idx[n_train + n_val :]

    log.info("Split — train: %d val: %d test: %d",
             len(train_idx), len(val_idx), len(test_idx))

    image_size = cfg.data.image_size
    train_ds = GalaxyZoo2Dataset(
        df.iloc[train_idx], cfg.data.image_dir,
        get_transforms(image_size, "train"))
    val_ds = GalaxyZoo2Dataset(
        df.iloc[val_idx], cfg.data.image_dir,
        get_transforms(image_size, "val"))
    test_ds = GalaxyZoo2Dataset(
        df.iloc[test_idx], cfg.data.image_dir,
        get_transforms(image_size, "test"))

    common = dict(
        batch_size  = cfg.training.batch_size,
        num_workers = cfg.data.num_workers,
        pin_memory  = cfg.data.pin_memory,
        drop_last   = False,
    )
    # BUGFIX: persistent_workers / prefetch_factor are only valid with
    # worker processes — passing them when num_workers == 0 makes
    # DataLoader raise ValueError. Only forward them when workers exist.
    if common["num_workers"] > 0:
        common["persistent_workers"] = getattr(cfg.data, "persistent_workers", True)
        common["prefetch_factor"]    = getattr(cfg.data, "prefetch_factor", 4)

    train_loader = DataLoader(train_ds, shuffle=True, **common)
    val_loader   = DataLoader(val_ds,   shuffle=False, **common)
    test_loader  = DataLoader(test_ds,  shuffle=False, **common)

    return train_loader, val_loader, test_loader
src/evaluate_full.py ADDED
@@ -0,0 +1,619 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/evaluate_full.py
3
+ --------------------
4
+ Full evaluation of all trained models on the held-out test set.
5
+
6
+ Generates all paper figures and tables:
7
+
8
+ Tables
9
+ ------
10
+ table_metrics_proposed.csv β€” MAE / RMSE / bias / ECE for proposed model
11
+ table_reached_branch_mae.csv β€” reached-branch MAE across all 5 models
12
+ table_simplex_violation.csv β€” simplex validity for sigmoid baseline
13
+
14
+ Figures (PDF + PNG, IEEE naming convention)
15
+ -------------------------------------------
16
+ fig_scatter_predicted_vs_true.pdf β€” predicted vs true vote fractions (proposed)
17
+ fig_calibration_reliability.pdf β€” reliability diagrams, all models
18
+ fig_ece_comparison.pdf β€” ECE bar chart, all models
19
+ fig_attention_rollout_gallery.pdf β€” full 12-layer attention rollout gallery
20
+ fig_attention_entropy_depth.pdf β€” CLS attention entropy vs. layer depth
21
+
22
+ Usage
23
+ -----
24
+ cd ~/galaxy
25
+ nohup python -m src.evaluate_full --config configs/full_train.yaml \
26
+ > outputs/logs/evaluate.log 2>&1 &
27
+ echo "PID: $!"
28
+ """
29
+
30
+ import argparse
31
+ import logging
32
+ import sys
33
+ from pathlib import Path
34
+
35
+ import numpy as np
36
+ import pandas as pd
37
+ import torch
38
+ import torch.nn.functional as F
39
+ import matplotlib
40
+ matplotlib.use("Agg")
41
+ import matplotlib.pyplot as plt
42
+ from torch.amp import autocast
43
+ from omegaconf import OmegaConf
44
+ from tqdm import tqdm
45
+
46
+ from src.dataset import build_dataloaders, QUESTION_GROUPS
47
+ from src.model import build_model, build_dirichlet_model
48
+ from src.metrics import (compute_metrics, predictions_to_numpy,
49
+ compute_reached_branch_mae_table,
50
+ dirichlet_predictions_to_numpy,
51
+ simplex_violation_rate, _compute_ece)
52
+ from src.attention_viz import plot_attention_grid, plot_attention_entropy
53
+ from src.baselines import ResNet18Baseline
54
+
55
# Root logger config: timestamped lines to stdout so nohup logs are readable.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout,
)
log = logging.getLogger("evaluate_full")

# ── Global matplotlib style ────────────────────────────────────────────────────
# Applied once at import time; every figure in this module inherits it.
plt.rcParams.update({
    "figure.dpi"       : 150,
    "savefig.dpi"      : 300,
    "font.family"      : "serif",
    "font.size"        : 11,
    "axes.titlesize"   : 11,
    "axes.labelsize"   : 11,
    "xtick.labelsize"  : 9,
    "ytick.labelsize"  : 9,
    "legend.fontsize"  : 9,
    "figure.facecolor" : "white",
    "axes.facecolor"   : "white",
    "axes.grid"        : True,
    "grid.alpha"       : 0.3,
    "pdf.fonttype"     : 42,  # editable (TrueType) text in PDF
    "ps.fonttype"      : 42,
})

# Human-readable description of each GZ2 question, used in plot titles.
QUESTION_LABELS = {
    "t01": "Smooth or features",
    "t02": "Edge-on disk",
    "t03": "Bar",
    "t04": "Spiral arms",
    "t05": "Bulge prominence",
    "t06": "Odd feature",
    "t07": "Roundedness",
    "t08": "Odd feature type",
    "t09": "Bulge shape",
    "t10": "Arms winding",
    "t11": "Arms number",
}

# Consistent colours and line styles for all models across all figures.
# Keys must match the model names used in main()'s model_results dict.
MODEL_COLORS = {
    "ResNet-18 + MSE (sigmoid)"          : "#c0392b",
    "ResNet-18 + KL+MSE"                 : "#e67e22",
    "ViT-Base + MSE only"                : "#2980b9",
    "ViT-Base + KL+MSE (proposed)"       : "#27ae60",
    "ViT-Base + Dirichlet (Zoobot-style)": "#8e44ad",
}
MODEL_STYLES = {
    "ResNet-18 + MSE (sigmoid)"          : "-",
    "ResNet-18 + KL+MSE"                 : "-.",
    "ViT-Base + MSE only"                : "--",
    "ViT-Base + KL+MSE (proposed)"       : "-",
    "ViT-Base + Dirichlet (Zoobot-style)": ":",
}
109
+
110
+
111
+ # ─────────────────────────────────────────────────────────────
112
+ # Inference helpers
113
+ # ─────────────────────────────────────────────────────────────
114
+
115
def _infer_vit(model, loader, device, cfg,
               collect_attn=True, n_attn=16):
    """Run a ViT model over *loader*; optionally capture attention maps.

    Returns
    -------
    preds, targets, weights : np.ndarray — stacked over all batches
    attn_imgs_t   : Tensor or None — first n_attn input images (CPU)
    merged_layers : list[Tensor] or None — per-layer attention for those
                    images, merged across the batches that supplied them
    attn_ids      : list[int] — dr7objids of the captured images
    """
    model.eval()
    all_preds, all_targets, all_weights = [], [], []
    attn_images, all_layer_attns, attn_ids = [], [], []
    attn_done = False  # stop collecting once n_attn images are captured

    with torch.no_grad():
        for images, targets, weights, image_ids in tqdm(loader, desc="ViT inference"):
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            weights = weights.to(device, non_blocking=True)
            with autocast("cuda", enabled=cfg.training.mixed_precision):
                logits = model(images)
            p, t, w = predictions_to_numpy(logits, targets, weights)
            all_preds.append(p)
            all_targets.append(t)
            all_weights.append(w)

            if collect_attn and not attn_done:
                # NOTE(review): assumes the model caches attention from the
                # forward pass just executed — confirm in src.model.
                layers = model.get_all_attention_weights()
                if layers is not None:
                    n = min(n_attn, images.shape[0])
                    attn_images.append(images[:n].cpu())
                    all_layer_attns.append([l[:n].cpu() for l in layers])
                    attn_ids.extend([int(i) for i in image_ids[:n]])
                    if len(attn_ids) >= n_attn:
                        attn_done = True

    preds = np.concatenate(all_preds)
    targets = np.concatenate(all_targets)
    weights = np.concatenate(all_weights)

    # Merge per-batch attention captures, trimming to exactly n_attn images.
    attn_imgs_t = torch.cat(attn_images, dim=0)[:n_attn] if attn_images else None
    merged_layers = None
    if all_layer_attns:
        merged_layers = [
            torch.cat([b[li] for b in all_layer_attns], dim=0)[:n_attn]
            for li in range(len(all_layer_attns[0]))
        ]

    return preds, targets, weights, attn_imgs_t, merged_layers, attn_ids
157
+
158
+
159
def _infer_resnet(model, loader, device, cfg, use_sigmoid: bool):
    """Run a ResNet baseline over *loader*; return stacked numpy arrays.

    use_sigmoid=True  -> independent per-answer sigmoid (MSE baseline,
                         outputs need not form simplices).
    use_sigmoid=False -> per-question softmax over each answer group.

    Returns (preds, targets, weights) as numpy arrays.
    """
    model.eval()
    preds_acc, targets_acc, weights_acc = [], [], []

    def _to_probs(raw):
        # Convert raw logits to per-answer probabilities on the CPU.
        if use_sigmoid:
            return torch.sigmoid(raw).cpu().numpy()
        out = raw.detach().cpu().clone()
        for _q, (lo, hi) in QUESTION_GROUPS.items():
            out[:, lo:hi] = F.softmax(out[:, lo:hi], dim=-1)
        return out.numpy()

    with torch.no_grad():
        for images, targets, weights, _ in tqdm(loader, desc="ResNet inference"):
            images = images.to(device, non_blocking=True)
            with autocast("cuda", enabled=cfg.training.mixed_precision):
                logits = model(images)
            preds_acc.append(_to_probs(logits))
            targets_acc.append(targets.numpy())
            weights_acc.append(weights.numpy())

    return (np.concatenate(preds_acc),
            np.concatenate(targets_acc),
            np.concatenate(weights_acc))
180
+
181
+
182
def _infer_dirichlet(model, loader, device, cfg):
    """Run the Dirichlet (Zoobot-style) model over *loader*.

    Concentration parameters from the model are converted to point
    predictions by dirichlet_predictions_to_numpy.
    Returns (preds, targets, weights) as numpy arrays.
    """
    model.eval()
    preds_acc, targets_acc, weights_acc = [], [], []

    with torch.no_grad():
        for images, targets, weights, _ in tqdm(loader, desc="Dirichlet inference"):
            images = images.to(device, non_blocking=True)
            with autocast("cuda", enabled=cfg.training.mixed_precision):
                alpha = model(images)
            batch = dirichlet_predictions_to_numpy(alpha, targets, weights)
            for acc, arr in zip((preds_acc, targets_acc, weights_acc), batch):
                acc.append(arr)

    return (np.concatenate(preds_acc),
            np.concatenate(targets_acc),
            np.concatenate(weights_acc))
197
+
198
+
199
+ # ─────────────────────────────────────────────────────────────
200
+ # Figure 1: Predicted vs true scatter (proposed model)
201
+ # ─────────────────────────────────────────────────────────────
202
+
203
def fig_scatter_predicted_vs_true(preds, targets, weights, save_dir):
    """3×4 grid of predicted-vs-true vote-fraction scatter plots, one per
    question, for the proposed model on reached branches (w ≥ 0.05).
    Saves both PDF and PNG; skips work if both already exist.
    """
    path_pdf = save_dir / "fig_scatter_predicted_vs_true.pdf"
    path_png = save_dir / "fig_scatter_predicted_vs_true.png"
    if path_pdf.exists() and path_png.exists():
        log.info("Skip (exists): fig_scatter_predicted_vs_true"); return

    fig, axes = plt.subplots(3, 4, figsize=(16, 12))
    axes = axes.flatten()

    for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
        ax = axes[q_idx]
        # Only samples where the question's branch was actually reached.
        mask = weights[:, q_idx] >= 0.05
        pq = preds[mask, start:end].flatten()
        tq = targets[mask, start:end].flatten()

        # rasterized=True keeps the PDF small despite many scatter points.
        ax.scatter(tq, pq, alpha=0.06, s=1, color="#2563eb", rasterized=True)
        ax.plot([0, 1], [0, 1], "r--", linewidth=1, alpha=0.8)
        ax.set_xlim(0, 1); ax.set_ylim(0, 1)
        ax.set_xlabel("True vote fraction")
        ax.set_ylabel("Predicted vote fraction")
        ax.set_title(
            f"{q_name}: {QUESTION_LABELS[q_name]}\n"
            f"$n$ = {mask.sum():,} (w ≥ 0.05)",
            fontsize=9,
        )
        ax.set_aspect("equal")
        mae = np.abs(pq - tq).mean()
        ax.text(0.05, 0.92, f"MAE = {mae:.3f}",
                transform=ax.transAxes, fontsize=8,
                bbox=dict(boxstyle="round,pad=0.2", facecolor="white",
                          edgecolor="grey", alpha=0.85))

    # 11 questions in a 12-cell grid — blank the unused last axis.
    axes[-1].axis("off")
    plt.suptitle(
        "Predicted vs. true vote fractions — reached branches (w ≥ 0.05)\n"
        "ViT-Base/16 + hierarchical KL+MSE (proposed model, test set)",
        fontsize=12,
    )
    plt.tight_layout()
    fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
    fig.savefig(path_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: fig_scatter_predicted_vs_true")
246
+
247
+
248
+ # ─────────────────────────────────────────────────────────────
249
+ # Figure 2: Calibration reliability diagrams
250
+ # ─────────────────────────────────────────────────────────────
251
+
252
def fig_calibration_reliability(model_results, save_dir, n_bins=15):
    """Reliability diagrams (mean predicted vs. mean true per bin) for all
    models on 8 representative questions, using adaptive equal-frequency
    bins so the binning matches the ECE computation.

    model_results maps model name -> (preds, targets, weights) arrays.
    """
    path_pdf = save_dir / "fig_calibration_reliability.pdf"
    path_png = save_dir / "fig_calibration_reliability.png"
    if path_pdf.exists() and path_png.exists():
        log.info("Skip (exists): fig_calibration_reliability"); return

    # Show 8 representative questions (skip t02 — bimodal, shown separately)
    q_show = ["t01", "t03", "t04", "t06", "t07", "t09", "t10", "t11"]
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    axes = axes.flatten()

    for ax_idx, q_name in enumerate(q_show):
        ax = axes[ax_idx]
        start, end = QUESTION_GROUPS[q_name]
        q_idx = list(QUESTION_GROUPS.keys()).index(q_name)

        for model_name, (preds, targets, weights) in model_results.items():
            mask = weights[:, q_idx] >= 0.05
            if mask.sum() < 50:
                # Too few reached samples for a meaningful curve.
                continue
            pf = preds[mask, start:end].flatten()
            tf = targets[mask, start:end].flatten()

            # Adaptive bins (equal-frequency) — consistent with ECE computation
            percentiles = np.linspace(0, 100, n_bins + 1)
            bin_edges = np.unique(np.percentile(pf, percentiles))
            if len(bin_edges) < 2:
                continue
            # Interior edges only; clip keeps every point in a valid bin.
            bin_ids = np.clip(
                np.digitize(pf, bin_edges[1:-1]), 0, len(bin_edges) - 2
            )
            mp = np.array([
                pf[bin_ids == b].mean() if (bin_ids == b).any() else np.nan
                for b in range(len(bin_edges) - 1)
            ])
            mt = np.array([
                tf[bin_ids == b].mean() if (bin_ids == b).any() else np.nan
                for b in range(len(bin_edges) - 1)
            ])
            valid = ~np.isnan(mp) & ~np.isnan(mt)
            ax.plot(
                mp[valid], mt[valid],
                MODEL_STYLES.get(model_name, "-"),
                color=MODEL_COLORS.get(model_name, "#888888"),
                linewidth=1.8, marker="o", markersize=3.5,
                label=model_name, alpha=0.9,
            )

        ax.plot([0, 1], [0, 1], "k--", linewidth=1, alpha=0.5, label="Perfect")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1)
        ax.set_xlabel("Mean predicted", fontsize=8)
        ax.set_ylabel("Mean true", fontsize=8)
        ax.set_title(f"{q_name}: {QUESTION_LABELS[q_name]}", fontsize=9)
        ax.set_aspect("equal")
        if ax_idx == 0:
            # Legend once, on the first panel only.
            ax.legend(fontsize=6.5, loc="upper left")

    plt.suptitle(
        "Calibration reliability diagrams — all models (test set)\n"
        "Reached branches only (w ≥ 0.05). Adaptive equal-frequency bins. "
        "Closer to diagonal = better calibrated.",
        fontsize=11,
    )
    plt.tight_layout()
    fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
    fig.savefig(path_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: fig_calibration_reliability")
320
+
321
+
322
+ # ─────────────────────────────────────────────────────────────
323
+ # Figure 3: ECE bar chart
324
+ # ─────────────────────────────────────────────────────────────
325
+
326
def fig_ece_comparison(model_results, save_dir):
    """Grouped bar chart of per-question ECE for every model, plus a CSV
    (table_ece_comparison.csv) with the same numbers. Lower is better.
    """
    path_pdf = save_dir / "fig_ece_comparison.pdf"
    path_png = save_dir / "fig_ece_comparison.png"
    if path_pdf.exists() and path_png.exists():
        log.info("Skip (exists): fig_ece_comparison"); return

    q_names = list(QUESTION_GROUPS.keys())
    ece_rows = []
    for model_name, (preds, targets, weights) in model_results.items():
        row = {"model": model_name}
        for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
            mask = weights[:, q_idx] >= 0.05
            if mask.sum() < 50:
                # Too few reached samples — ECE undefined for this question.
                row[q_name] = float("nan")
            else:
                row[q_name] = _compute_ece(
                    preds[mask, start:end].flatten(),
                    targets[mask, start:end].flatten(),
                    n_bins=15,
                )
        # nanmean so questions with undefined ECE don't poison the average.
        row["mean_ece"] = float(
            np.nanmean([row[q] for q in q_names])
        )
        ece_rows.append(row)

    df_ece = pd.DataFrame(ece_rows)
    df_ece.to_csv(save_dir / "table_ece_comparison.csv", index=False)

    x = np.arange(len(q_names))
    width = 0.80 / len(model_results)  # bars share 80% of each slot
    palette = list(MODEL_COLORS.values())

    fig, ax = plt.subplots(figsize=(14, 5))
    for i, (model_name, _) in enumerate(model_results.items()):
        vals = [
            float(df_ece[df_ece["model"] == model_name][q].values[0])
            for q in q_names
        ]
        ax.bar(
            x + i * width, vals, width,
            label=model_name,
            color=MODEL_COLORS.get(model_name, palette[i % len(palette)]),
            alpha=0.85, edgecolor="white", linewidth=0.5,
        )

    # Centre the tick under each group of bars.
    ax.set_xticks(x + width * (len(model_results) - 1) / 2)
    ax.set_xticklabels(
        [f"{q}\n({QUESTION_LABELS[q][:12]})" for q in q_names],
        rotation=30, ha="right", fontsize=8,
    )
    ax.set_ylabel("Expected Calibration Error (ECE)", fontsize=11)
    ax.set_title(
        "Expected Calibration Error — all models (test set)\n"
        "Reached branches (w ≥ 0.05). Adaptive equal-frequency binning. "
        "Lower is better.",
        fontsize=11,
    )
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")
    ax.set_axisbelow(True)
    plt.tight_layout()
    fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
    fig.savefig(path_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: fig_ece_comparison")
391
+
392
+
393
+ # ─────────────────────────────────────────────────────────────
394
+ # Figure 4: Attention rollout gallery
395
+ # ─────────────────────────────────────────────────────────────
396
+
397
def fig_attention_rollout_gallery(attn_imgs, all_layers, attn_ids, save_dir):
    """Render the attention-rollout gallery as PDF + PNG, plus a 600-dpi
    PNG for journal submission. Existing outputs are not regenerated.
    """
    if attn_imgs is None or all_layers is None:
        log.warning("No attention data — skipping gallery."); return

    pdf_path = save_dir / "fig_attention_rollout_gallery.pdf"
    png_path = save_dir / "fig_attention_rollout_gallery.png"
    hq_path = save_dir / "fig_attention_rollout_gallery_HQ.png"

    if not pdf_path.exists():
        # plot_attention_grid writes the PNG itself via save_path.
        gallery = plot_attention_grid(
            attn_imgs, all_layers, attn_ids,
            save_path=str(png_path),
            n_cols=4, rollout_mode="full",
        )
        gallery.savefig(pdf_path, dpi=300, bbox_inches="tight", facecolor="black")
        plt.close(gallery)
        log.info("Saved: fig_attention_rollout_gallery")

    # High-resolution PNG for journal submission.
    if not hq_path.exists():
        gallery_hq = plot_attention_grid(
            attn_imgs, all_layers, attn_ids,
            n_cols=4, rollout_mode="full",
        )
        gallery_hq.savefig(hq_path, dpi=600, bbox_inches="tight", facecolor="black")
        plt.close(gallery_hq)
        log.info("Saved: fig_attention_rollout_gallery_HQ (600 dpi)")
424
+
425
+
426
+ # ─────────────────────────────────────────────────────────────
427
+ # Figure 5: Attention entropy vs. depth
428
+ # ─────────────────────────────────────────────────────────────
429
+
430
def fig_attention_entropy_depth(all_layers, save_dir):
    """Save the CLS-attention-entropy-vs-depth figure (PDF + PNG)."""
    if all_layers is None:
        log.warning("No attention layers — skipping entropy plot."); return

    pdf_path = save_dir / "fig_attention_entropy_depth.pdf"
    png_path = save_dir / "fig_attention_entropy_depth.png"
    if pdf_path.exists() and png_path.exists():
        log.info("Skip (exists): fig_attention_entropy_depth")
        return

    # plot_attention_entropy writes the PNG itself; we add the PDF copy.
    entropy_fig = plot_attention_entropy(all_layers, save_path=str(png_path))
    entropy_fig.savefig(pdf_path, dpi=300, bbox_inches="tight")
    plt.close(entropy_fig)
    log.info("Saved: fig_attention_entropy_depth")
443
+
444
+
445
+ # ─────────────────────────────────────────────────────────────
446
+ # Table: metrics for proposed model
447
+ # ─────────────────────────────────────────────────────────────
448
+
449
def table_metrics_proposed(preds, targets, weights, save_dir):
    """Write per-question MAE / RMSE / bias / ECE for the proposed model
    to table_metrics_proposed.csv, log the table, and return the raw
    metrics dict from compute_metrics.
    """
    metrics = compute_metrics(preds, targets, weights)

    # One row per question, then a weighted-average summary row.
    rows = [
        {
            "question"   : q,
            "description": QUESTION_LABELS[q],
            "MAE"        : round(metrics[f"mae/{q}"], 5),
            "RMSE"       : round(metrics[f"rmse/{q}"], 5),
            "bias"       : round(metrics[f"bias/{q}"], 5),
            "ECE"        : round(metrics[f"ece/{q}"], 5),
        }
        for q in QUESTION_GROUPS
    ]
    rows.append({
        "question": "weighted_avg", "description": "Weighted average",
        "MAE" : round(metrics["mae/weighted_avg"], 5),
        "RMSE": round(metrics["rmse/weighted_avg"], 5),
        "bias": "",  # no meaningful weighted-average bias
        "ECE" : round(metrics["ece/mean"], 5),
    })

    table = pd.DataFrame(rows)
    table.to_csv(save_dir / "table_metrics_proposed.csv", index=False)
    log.info("\n%s\n", table.to_string(index=False))
    return metrics
472
+
473
+
474
# ─────────────────────────────────────────────────────────────
475
+ # Table: simplex violation for sigmoid baseline
476
+ # ─────────────────────────────────────────────────────────────
477
+
478
def table_simplex_violation(model_results, save_dir):
    """
    For each model, report the fraction of test samples whose per-question
    predictions do not sum to 1 ± 0.02. Softmax models should be ~0; the
    sigmoid baseline is expected to be nonzero. This table explains why
    the sigmoid baseline can reach lower raw per-answer MAE while being
    scientifically invalid: unconstrained sigmoid outputs fit each
    marginal independently.
    """
    rows = []
    for name, (preds, _, _) in model_results.items():
        svr = simplex_violation_rate(preds, tolerance=0.02)
        entry = {"model": name}
        for q in QUESTION_GROUPS:
            entry[q] = round(svr[q], 4)
        entry["mean"] = round(svr["mean"], 4)
        rows.append(entry)

    table = pd.DataFrame(rows)
    table.to_csv(save_dir / "table_simplex_violation.csv", index=False)
    log.info("Saved: table_simplex_violation.csv")
    log.info("\n%s\n", table[["model", "mean"]].to_string(index=False))
    return table
498
+
499
+
500
+ # ─────────────────────────────────────────────────────────────
501
+ # Main
502
+ # ─────────────────────────────────────────────────────────────
503
+
504
def main():
    """Load all trained checkpoints, run test-set inference, and emit
    every paper table and figure into <figures_dir>/evaluation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True)
    args = parser.parse_args()

    # Experiment config overrides the base config.
    base_cfg = OmegaConf.load("configs/base.yaml")
    exp_cfg = OmegaConf.load(args.config)
    cfg = OmegaConf.merge(base_cfg, exp_cfg)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    save_dir = Path(cfg.outputs.figures_dir) / "evaluation"
    save_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir = Path(cfg.outputs.checkpoint_dir)

    # Only the test split is needed here.
    _, _, test_loader = build_dataloaders(cfg)

    # ── Load all models ────────────────────────────────────────
    log.info("Loading models from: %s", ckpt_dir)

    def _load(path, model):
        # weights_only=True: safe load, no arbitrary pickled objects.
        ckpt = torch.load(path, map_location="cpu", weights_only=True)
        model.load_state_dict(ckpt["model_state"])
        return model

    vit_proposed = _load(
        ckpt_dir / "best_full_train.pt", build_model(cfg)
    ).to(device)

    vit_mse = _load(
        ckpt_dir / "baseline_vit_mse.pt", build_model(cfg)
    ).to(device)

    rn_mse = _load(
        ckpt_dir / "baseline_resnet18_mse.pt",
        ResNet18Baseline(dropout=cfg.model.dropout)
    ).to(device)

    rn_kl = _load(
        ckpt_dir / "baseline_resnet18_klmse.pt",
        ResNet18Baseline(dropout=cfg.model.dropout)
    ).to(device)

    # Dirichlet baseline is optional — only loaded if its checkpoint exists.
    vit_dirichlet = None
    dp = ckpt_dir / "baseline_vit_dirichlet.pt"
    if dp.exists():
        vit_dirichlet = _load(dp, build_dirichlet_model(cfg)).to(device)
        log.info("Loaded: ViT-Base + Dirichlet")

    # ── Run inference ──────────────────────────────────────────
    log.info("Running inference on test set...")

    # Attention maps are captured only for the proposed model.
    (p_proposed, t_proposed, w_proposed,
     attn_imgs, all_layers, attn_ids) = _infer_vit(
        vit_proposed, test_loader, device, cfg,
        collect_attn=True, n_attn=16,
    )

    # [:3] drops the (unused) attention outputs of the second ViT run.
    p_vit_mse, t_vit_mse, w_vit_mse = _infer_vit(
        vit_mse, test_loader, device, cfg, collect_attn=False
    )[:3]

    p_rn_mse, t_rn_mse, w_rn_mse = _infer_resnet(
        rn_mse, test_loader, device, cfg, use_sigmoid=True
    )

    p_rn_kl, t_rn_kl, w_rn_kl = _infer_resnet(
        rn_kl, test_loader, device, cfg, use_sigmoid=False
    )

    # Build model_results dict (order determines legend order in figures)
    model_results = {
        "ResNet-18 + MSE (sigmoid)"    : (p_rn_mse, t_rn_mse, w_rn_mse),
        "ResNet-18 + KL+MSE"           : (p_rn_kl, t_rn_kl, w_rn_kl),
        "ViT-Base + MSE only"          : (p_vit_mse, t_vit_mse, w_vit_mse),
        "ViT-Base + KL+MSE (proposed)" : (p_proposed, t_proposed, w_proposed),
    }

    if vit_dirichlet is not None:
        p_dir, t_dir, w_dir = _infer_dirichlet(
            vit_dirichlet, test_loader, device, cfg
        )
        model_results["ViT-Base + Dirichlet (Zoobot-style)"] = (p_dir, t_dir, w_dir)

    # ── Tables ─────────────────────────────────────────────────
    log.info("Computing metrics...")
    table_metrics_proposed(p_proposed, t_proposed, w_proposed, save_dir)

    log.info("Computing reached-branch MAE table...")
    df_r = compute_reached_branch_mae_table(model_results)
    df_r.to_csv(save_dir / "table_reached_branch_mae.csv", index=False)
    log.info("Saved: table_reached_branch_mae.csv")

    log.info("Computing simplex violation table...")
    table_simplex_violation(model_results, save_dir)

    # ── Figures ────────────────────────────────────────────────
    log.info("Generating figures...")
    fig_scatter_predicted_vs_true(p_proposed, t_proposed, w_proposed, save_dir)
    fig_calibration_reliability(model_results, save_dir)
    fig_ece_comparison(model_results, save_dir)
    fig_attention_rollout_gallery(attn_imgs, all_layers, attn_ids, save_dir)
    fig_attention_entropy_depth(all_layers, save_dir)

    log.info("=" * 60)
    log.info("ALL OUTPUTS SAVED TO: %s", save_dir)
    log.info("=" * 60)

    # Headline numbers for the proposed model, logged last for convenience.
    metrics = compute_metrics(p_proposed, t_proposed, w_proposed)
    log.info("Proposed model — test set results:")
    log.info("  Weighted MAE  = %.5f", metrics["mae/weighted_avg"])
    log.info("  Weighted RMSE = %.5f", metrics["rmse/weighted_avg"])
    log.info("  Mean ECE      = %.5f", metrics["ece/mean"])


if __name__ == "__main__":
    main()
src/loss.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/loss.py
3
+ -----------
4
+ Loss functions for hierarchical probabilistic vote-fraction regression.
5
+
6
+ Three losses are implemented:
7
+
8
+ 1. HierarchicalLoss β€” proposed method: weighted KL + MSE per question.
9
+ 2. DirichletLoss β€” Zoobot-style comparison: weighted Dirichlet NLL.
10
+ 3. MSEOnlyLoss β€” ablation baseline: hierarchical MSE, no KL term.
11
+
12
+ Both main losses use identical per-sample hierarchical weighting:
13
+ w_q = parent branch vote fraction (1.0 for root question t01)
14
+
15
+ Mathematical formulation
16
+ ------------------------
17
+ HierarchicalLoss per question q:
18
+ L_q = w_q * [ Ξ»_kl * KL(p_q || Ε·_q) + Ξ»_mse * MSE(Ε·_q, p_q) ]
19
+
20
+ where p_q = ground-truth vote fractions [B, A_q]
21
+ Ε·_q = softmax(logits_q) [B, A_q]
22
+ w_q = hierarchical weight [B]
23
+
24
+ DirichletLoss per question q:
25
+ L_q = w_q * [ log B(Ξ±_q) βˆ’ Ξ£_a (Ξ±_qa βˆ’ 1) log(p_qa) ]
26
+
27
+ where Ξ±_q = 1 + softplus(logits_q) > 1 [B, A_q]
28
+
29
+ References
30
+ ----------
31
+ Walmsley et al. (2022), MNRAS 509, 3966 (Zoobot β€” Dirichlet approach)
32
+ """
33
+
34
+ import torch
35
+ import torch.nn as nn
36
+ import torch.nn.functional as F
37
+ from omegaconf import DictConfig
38
+ from src.dataset import QUESTION_GROUPS
39
+
40
+
41
class HierarchicalLoss(nn.Module):
    """Proposed loss: weighted per-question KL divergence plus MSE.

    For each of the 11 GZ2 question groups the logits are normalised with a
    softmax, then combined per sample as

        L_q = w_q * (lambda_kl * KL(p_q || yhat_q) + lambda_mse * MSE(yhat_q, p_q))

    where w_q is the hierarchical weight (parent branch vote fraction, 1.0
    for the root question). Per-question batch means are summed into the
    total loss.
    """

    def __init__(self, cfg: "DictConfig"):
        super().__init__()
        # Mixing coefficients for the two terms and the clamp floor used to
        # keep the logs inside the KL term finite.
        self.lambda_kl = float(cfg.loss.lambda_kl)
        self.lambda_mse = float(cfg.loss.lambda_mse)
        self.epsilon = float(cfg.loss.epsilon)
        # Flattened (name, start, end) column slices, one per question group.
        self.question_slices = [(q, s, e) for q, (s, e) in QUESTION_GROUPS.items()]

    def forward(self, predictions: torch.Tensor,
                targets: torch.Tensor, weights: torch.Tensor):
        """Return (total_loss, per-question loss dict).

        predictions : [B, 37] raw logits
        targets     : [B, 37] ground-truth vote fractions
        weights     : [B, 11] hierarchical weight per question
        """
        total = torch.zeros(1, device=predictions.device, dtype=predictions.dtype)
        parts = {}

        for idx, (name, lo, hi) in enumerate(self.question_slices):
            probs = F.softmax(predictions[:, lo:hi], dim=-1)
            truth = targets[:, lo:hi]
            w = weights[:, idx]

            # Clamp both distributions before taking logs so KL stays finite.
            p_safe = probs.clamp(self.epsilon, 1.0)
            t_safe = truth.clamp(self.epsilon, 1.0)
            kl = (t_safe * (t_safe.log() - p_safe.log())).sum(dim=-1)

            # MSE uses the unclamped softmax output, averaged over answers.
            mse = ((probs - truth) ** 2).mean(dim=-1)

            per_sample = self.lambda_kl * kl + self.lambda_mse * mse
            q_mean = (w * per_sample).mean()

            total = total + q_mean
            parts[f"loss/{name}"] = q_mean.detach().item()

        parts["loss/total"] = total.detach().item()
        return total, parts
82
+
83
+
84
class DirichletLoss(nn.Module):
    """
    Weighted hierarchical Dirichlet negative log-likelihood.

    Trains GalaxyViTDirichlet as the Zoobot-style comparison baseline
    (Walmsley et al. 2022). Per question the NLL of the target fractions
    under Dirichlet(alpha) is

        log B(alpha) - sum_a (alpha_a - 1) * log(p_a)

    weighted by the hierarchical branch weight and averaged over the batch.
    """

    def __init__(self, cfg: "DictConfig"):
        super().__init__()
        # Floor applied to target fractions before the log.
        self.epsilon = float(cfg.loss.epsilon)
        # Flattened (name, start, end) column slices, one per question group.
        self.question_slices = [(q, s, e) for q, (s, e) in QUESTION_GROUPS.items()]

    def forward(self, alpha: torch.Tensor,
                targets: torch.Tensor, weights: torch.Tensor):
        """Return (total_loss, per-question loss dict).

        alpha   : [B, 37] Dirichlet concentrations (> 1 from the model head)
        targets : [B, 37] ground-truth vote fractions
        weights : [B, 11] hierarchical weight per question
        """
        total = torch.zeros(1, device=alpha.device, dtype=alpha.dtype)
        parts = {}

        for idx, (name, lo, hi) in enumerate(self.question_slices):
            a = alpha[:, lo:hi]
            t = targets[:, lo:hi].clamp(min=self.epsilon)
            w = weights[:, idx]

            # Normalising constant: log B(a) = sum lgamma(a_i) - lgamma(sum a_i).
            log_norm = torch.lgamma(a).sum(dim=-1) - torch.lgamma(a.sum(dim=-1))
            # Data term of the Dirichlet log-density at the target fractions.
            data_term = ((a - 1.0) * t.log()).sum(dim=-1)

            nll = log_norm - data_term
            q_mean = (w * nll).mean()

            total = total + q_mean
            parts[f"loss/{name}"] = q_mean.detach().item()

        parts["loss/total"] = total.detach().item()
        return total, parts
124
+
125
+
126
class MSEOnlyLoss(nn.Module):
    """
    Ablation baseline: hierarchical per-question MSE with no KL term.
    Numerically equivalent to HierarchicalLoss with lambda_kl = 0 and
    lambda_mse = 1.
    """

    def __init__(self, cfg: "DictConfig"):
        super().__init__()
        # Kept for interface parity with the other losses (unused in forward).
        self.epsilon = float(cfg.loss.epsilon)
        # Flattened (name, start, end) column slices, one per question group.
        self.question_slices = [(q, s, e) for q, (s, e) in QUESTION_GROUPS.items()]

    def forward(self, predictions: torch.Tensor,
                targets: torch.Tensor, weights: torch.Tensor):
        """Return (total_loss, per-question loss dict).

        predictions : [B, 37] raw logits
        targets     : [B, 37] ground-truth vote fractions
        weights     : [B, 11] hierarchical weight per question
        """
        total = torch.zeros(1, device=predictions.device, dtype=predictions.dtype)
        parts = {}

        for idx, (name, lo, hi) in enumerate(self.question_slices):
            probs = F.softmax(predictions[:, lo:hi], dim=-1)
            # Squared error averaged over the answers of this question.
            err = ((probs - targets[:, lo:hi]) ** 2).mean(dim=-1)
            q_mean = (weights[:, idx] * err).mean()

            total = total + q_mean
            parts[f"loss/{name}"] = q_mean.detach().item()

        parts["loss/total"] = total.detach().item()
        return total, parts
src/metrics.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/metrics.py
3
+ --------------
4
+ Evaluation metrics for hierarchical probabilistic vote-fraction regression
5
+ on Galaxy Zoo 2.
6
+
7
+ Three evaluation regimes
8
+ ------------------------
9
+ 1. GLOBAL β€” all test samples (dominated by root question t01).
10
+ 2. REACHED-BRANCH β€” samples where branch was actually reached (w >= threshold).
11
+ This is the scientifically correct regime for conditional questions.
12
+ 3. ECE β€” Expected Calibration Error using adaptive (equal-frequency) bins.
13
+
14
+ Fixes applied vs. original
15
+ ---------------------------
16
+ - ECE uses adaptive binning (equal-frequency bins) instead of equal-width.
17
+ Equal-width bins saturate at 0.200 for bimodal questions (t02, t03, t04)
18
+ where predictions cluster near 0 and 1. Adaptive bins are unbiased for
19
+ any distribution shape.
20
+ - simplex_violation_rate() added: fraction of question groups where the
21
+ sigmoid baseline predictions do not sum to 1 Β± 0.02. Used to explain
22
+ why ResNet-18 + sigmoid achieves lower raw MAE despite predicting
23
+ invalid distributions.
24
+ """
25
+
26
+ import numpy as np
27
+ import torch
28
+ import torch.nn.functional as F
29
+ from src.dataset import QUESTION_GROUPS
30
+
31
+ WEIGHT_THRESHOLDS = [0.05, 0.50, 0.75]
32
+
33
+
34
+ # ─────────────────────────────────────────────────────────────
35
+ # Main metrics function
36
+ # ─────────────────────────────────────────────────────────────
37
+
38
def compute_metrics(
    all_predictions: np.ndarray,  # [N, 37] per-answer predicted vote fractions
    all_targets: np.ndarray,      # [N, 37] ground-truth vote fractions
    all_weights: np.ndarray,      # [N, 11] hierarchical weight per question
) -> dict:
    """
    Full metrics suite: global + reached-branch MAE/RMSE + bias + ECE.

    Returns a flat dict keyed "<metric>/<question>" plus aggregate keys
    ("mae/weighted_avg", "rmse/weighted_avg", "ece/mean",
    "mae_w<thresh>/conditional_avg", "n_reached_w<thresh>/<question>").
    All values are plain Python floats/ints so the dict serialises cleanly.
    """
    metrics = {}
    # NOTE(review): q_names is never used below — candidate for removal.
    q_names = list(QUESTION_GROUPS.keys())

    # ── 1. Global metrics ──────────────────────────────────────
    # Per-question MAE/RMSE over ALL samples, plus a signed bias term.
    mae_values = []
    rmse_values = []
    weight_means = []

    for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
        pred_q = all_predictions[:, start:end]
        target_q = all_targets[:, start:end]
        weight_q = all_weights[:, q_idx]

        # Mean over the question's answers first, then over samples.
        mae_q = np.abs(pred_q - target_q).mean(axis=1).mean()
        rmse_q = np.sqrt(((pred_q - target_q) ** 2).mean(axis=1).mean())
        w_mean = weight_q.mean()

        metrics[f"mae/{q_name}"] = float(mae_q)
        metrics[f"rmse/{q_name}"] = float(rmse_q)
        # Signed mean error: positive means systematic over-prediction.
        metrics[f"bias/{q_name}"] = float(
            (all_predictions[:, start:end] - all_targets[:, start:end]).mean()
        )

        mae_values.append(mae_q)
        rmse_values.append(rmse_q)
        weight_means.append(w_mean)

    # Aggregate: each question weighted by its mean hierarchical weight, so
    # the always-asked root question contributes more than rare branches.
    weight_means = np.array(weight_means)
    weight_sum = weight_means.sum()
    metrics["mae/weighted_avg"] = float(
        (weight_means * np.array(mae_values)).sum() / weight_sum
    )
    metrics["rmse/weighted_avg"] = float(
        (weight_means * np.array(rmse_values)).sum() / weight_sum
    )

    # ── 2. Reached-branch metrics ──────────────────────────────
    # Restrict each question to samples whose branch weight clears the
    # threshold; questions with fewer than 10 reached samples get NaN.
    for thresh in WEIGHT_THRESHOLDS:
        thresh_key = str(thresh).replace(".", "")  # e.g. 0.05 -> "005", 0.5 -> "05"
        branch_maes = []
        branch_ws = []

        for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
            pred_q = all_predictions[:, start:end]
            target_q = all_targets[:, start:end]
            weight_q = all_weights[:, q_idx]
            mask = weight_q >= thresh
            n_reached = mask.sum()
            metrics[f"n_reached_w{thresh_key}/{q_name}"] = int(n_reached)

            if n_reached >= 10:
                mae_q = np.abs(pred_q[mask] - target_q[mask]).mean(axis=1).mean()
                metrics[f"mae_w{thresh_key}/{q_name}"] = float(mae_q)
                branch_maes.append(mae_q)
                branch_ws.append(weight_q[mask].mean())
            else:
                metrics[f"mae_w{thresh_key}/{q_name}"] = float("nan")

        # Weighted average over the questions that had enough reached samples.
        if branch_maes:
            bw = np.array(branch_ws)
            bm = np.array(branch_maes)
            metrics[f"mae_w{thresh_key}/conditional_avg"] = float(
                (bw * bm).sum() / bw.sum()
            )

    # ── 3. ECE per question (adaptive binning) ─────────────────
    # Calibration is measured on the flattened (sample, answer) values.
    ece_values = []
    for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
        pred_flat = all_predictions[:, start:end].flatten()
        target_flat = all_targets[:, start:end].flatten()
        ece = _compute_ece(pred_flat, target_flat)
        metrics[f"ece/{q_name}"] = float(ece)
        ece_values.append(ece)

    # nanmean: questions whose ECE is undefined (degenerate bins) are skipped.
    metrics["ece/mean"] = float(np.nanmean(ece_values))

    return metrics
123
+
124
+
125
+ # ─────────────────────────────────────────────────────────────
126
+ # ECE β€” adaptive (equal-frequency) binning
127
+ # ─────────────────────────────────────────────────────────────
128
+
129
+ def _compute_ece(pred: np.ndarray, target: np.ndarray,
130
+ n_bins: int = 15) -> float:
131
+ """
132
+ Expected Calibration Error with adaptive (equal-frequency) binning.
133
+
134
+ Equal-width binning saturates for bimodal distributions (e.g. t02, t03,
135
+ t04 where predictions cluster at 0 and 1) because >95% of samples fall
136
+ into boundary bins. Adaptive binning places bin edges at percentiles of
137
+ the predicted distribution, giving each bin an equal number of samples
138
+ and making ECE meaningful regardless of the prediction distribution shape.
139
+
140
+ Parameters
141
+ ----------
142
+ pred : [N] predicted vote fractions
143
+ target : [N] true vote fractions
144
+ n_bins : number of bins (default 15)
145
+
146
+ Returns
147
+ -------
148
+ ECE : float in [0, 1]
149
+ """
150
+ if len(pred) < n_bins:
151
+ return float("nan")
152
+
153
+ # Build equal-frequency bin edges from percentiles of pred
154
+ percentiles = np.linspace(0, 100, n_bins + 1)
155
+ bin_edges = np.unique(np.percentile(pred, percentiles))
156
+
157
+ if len(bin_edges) < 2:
158
+ return float("nan")
159
+
160
+ # Assign samples to bins (digitize returns 1-indexed; clip to [0, n-2])
161
+ bin_ids = np.clip(np.digitize(pred, bin_edges[1:-1]), 0, len(bin_edges) - 2)
162
+
163
+ ece = 0.0
164
+ n = len(pred)
165
+ for b in np.unique(bin_ids):
166
+ mask = bin_ids == b
167
+ if not mask.any():
168
+ continue
169
+ ece += (mask.sum() / n) * abs(pred[mask].mean() - target[mask].mean())
170
+
171
+ return float(ece)
172
+
173
+
174
+ # ─────────────────────────────────────────────────────────────
175
+ # Simplex violation rate
176
+ # ─────────────────────────────────────────────────────────────
177
+
178
def simplex_violation_rate(
    predictions: np.ndarray,  # [N, 37]
    tolerance: float = 0.02,
) -> dict:
    """
    Fraction of galaxies whose per-question predictions do NOT sum to
    1 Β± tolerance.

    A softmax-per-question model satisfies the simplex constraint by
    construction (rate β‰ˆ 0), whereas an unconstrained sigmoid baseline does
    not — explaining its lower raw per-answer MAE despite producing invalid
    probability distributions.

    Parameters
    ----------
    predictions : [N, 37] predicted values
    tolerance   : acceptable deviation from 1.0 (default 0.02)

    Returns
    -------
    dict mapping question name -> violation rate in [0, 1], plus a "mean"
    entry averaging the per-question rates.
    """
    rates = {}
    for q_name, (start, end) in QUESTION_GROUPS.items():
        sums = predictions[:, start:end].sum(axis=1)
        rates[q_name] = float((np.abs(sums - 1.0) > tolerance).mean())
    # "mean" is appended after the loop so it averages only question rates.
    rates["mean"] = float(np.mean(list(rates.values())))
    return rates
209
+
210
+
211
+ # ─────────────────────────────────────────────────────────────
212
+ # Reached-branch comparison table (for paper Table 2)
213
+ # ─────────────────────────────────────────────────────────────
214
+
215
def compute_reached_branch_mae_table(
    model_results: dict,
) -> "pd.DataFrame":
    """
    Build the reached-branch MAE comparison table across all models.

    Parameters
    ----------
    model_results : dict mapping model_name β†’ (preds, targets, weights)
        preds/targets are [N, 37] vote-fraction arrays; weights is [N, 11].

    Returns
    -------
    pd.DataFrame with columns:
        model, question, description, n_w005, mae_w005, mae_w050, mae_w075
    plus one "weighted_avg" summary row per model.
    """
    # Imported lazily so this module has no hard pandas dependency.
    import pandas as pd

    # Human-readable labels for the 11 GZ2 decision-tree questions.
    QUESTION_DESCRIPTIONS = {
        "t01": "Smooth or features",
        "t02": "Edge-on disk",
        "t03": "Bar",
        "t04": "Spiral arms",
        "t05": "Bulge prominence",
        "t06": "Odd feature",
        "t07": "Roundedness (smooth)",
        "t08": "Odd feature type",
        "t09": "Bulge shape (edge-on)",
        "t10": "Arms winding",
        "t11": "Arms number",
    }

    rows = []
    for model_name, (preds, targets, weights) in model_results.items():
        # One row per (model, question) with a count/MAE pair per threshold.
        for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
            pred_q = preds[:, start:end]
            target_q = targets[:, start:end]
            weight_q = weights[:, q_idx]

            row = {
                "model" : model_name,
                "question" : q_name,
                "description": QUESTION_DESCRIPTIONS[q_name],
            }
            # MAE is NaN when fewer than 10 galaxies reached the branch.
            for thresh in WEIGHT_THRESHOLDS:
                mask = weight_q >= thresh
                n = mask.sum()
                key = f"n_w{str(thresh).replace('.','')}"
                mkey = f"mae_w{str(thresh).replace('.','')}"
                row[key] = int(n)
                row[mkey] = (
                    float(np.abs(pred_q[mask] - target_q[mask]).mean(axis=1).mean())
                    if n >= 10 else float("nan")
                )
            rows.append(row)

        # Weighted-average summary row for this model (w >= 0.05 regime only;
        # the other threshold columns are filled with NaN by design).
        branch_maes = []
        branch_ws = []
        for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
            weight_q = weights[:, q_idx]
            pred_q = preds[:, start:end]
            target_q = targets[:, start:end]
            mask = weight_q >= 0.05
            if mask.sum() >= 10:
                branch_maes.append(
                    np.abs(pred_q[mask] - target_q[mask]).mean(axis=1).mean()
                )
                branch_ws.append(weight_q[mask].mean())

        bw = np.array(branch_ws)
        bm = np.array(branch_maes)
        # NOTE(review): the n_w005 expression below counts reached
        # (galaxy, question) PAIRS, not galaxies, and hardcodes 11 questions
        # instead of len(QUESTION_GROUPS) — confirm this is intended.
        rows.append({
            "model" : model_name,
            "question" : "weighted_avg",
            "description": "Weighted average (wβ‰₯0.05)",
            "n_w005" : int(sum(weights[:, q] >= 0.05 for q in range(11)).sum()
                           if hasattr(weights, "__len__") else 0),
            "mae_w005" : float((bw * bm).sum() / bw.sum()) if len(bw) > 0 else float("nan"),
            "mae_w050" : float("nan"),
            "mae_w075" : float("nan"),
        })

    return pd.DataFrame(rows)
299
+
300
+
301
+ # ─────────────────────────────────────────────────────────────
302
+ # Tensor β†’ numpy helpers
303
+ # ─────────────────────────────────────────────────────────────
304
+
305
def predictions_to_numpy(
    predictions: torch.Tensor,
    targets: torch.Tensor,
    weights: torch.Tensor,
) -> tuple:
    """Softmax each question group's logits and return (preds, targets, weights)
    as numpy arrays. The input logits tensor is not modified (a clone is used)."""
    out = predictions.detach().cpu().clone()
    for _, (lo, hi) in QUESTION_GROUPS.items():
        out[:, lo:hi] = F.softmax(out[:, lo:hi], dim=-1)
    return (
        out.numpy(),
        targets.detach().cpu().numpy(),
        weights.detach().cpu().numpy(),
    )
319
+
320
+
321
def dirichlet_predictions_to_numpy(
    alpha: torch.Tensor,
    targets: torch.Tensor,
    weights: torch.Tensor,
) -> tuple:
    """Convert Dirichlet concentrations to mean vote fractions per question
    (alpha / sum(alpha)) and return (means, targets, weights) as numpy arrays."""
    means = torch.zeros_like(alpha)
    for _, (lo, hi) in QUESTION_GROUPS.items():
        grp = alpha[:, lo:hi]
        means[:, lo:hi] = grp / grp.sum(dim=-1, keepdim=True)
    return (
        means.detach().cpu().numpy(),
        targets.detach().cpu().numpy(),
        weights.detach().cpu().numpy(),
    )
src/model.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/model.py
3
+ ------------
4
+ Vision Transformer (ViT-Base/16) backbone with three head variants:
5
+
6
+ 1. GalaxyViT β€” linear regression head (37 logits). Proposed model.
7
+ 2. GalaxyViTDirichlet β€” Dirichlet concentration head (Zoobot-style baseline).
8
+ 3. mc_dropout_predict β€” MC Dropout uncertainty estimation wrapper.
9
+
10
+ Architecture
11
+ ------------
12
+ Backbone : vit_base_patch16_224 from timm (pretrained ImageNet-21k)
13
+ 12 transformer layers, 12 heads, embed_dim=768
14
+ Input : [B, 3, 224, 224]
15
+ CLS out: [B, 768]
16
+ Head : Dropout(p) β†’ Linear(768, 37)
17
+
18
+ Full multi-layer attention rollout
19
+ ------------------------------------
20
+ All 12 transformer blocks use fused_attn=False so forward hooks can
21
+ capture the post-softmax attention matrices. Rollout is computed in
22
+ attention_viz.py using the corrected right-multiplication order.
23
+
24
+ MC Dropout
25
+ -----------
26
+ enable_mc_dropout() keeps Dropout active at inference time.
27
+ Running N stochastic forward passes gives mean prediction and
28
+ per-answer std (epistemic uncertainty). N=30 is standard practice
29
+ per Gal & Ghahramani (2016).
30
+
31
+ Dirichlet head
32
+ --------------
33
+ Outputs Ξ± > 1 per answer via: Ξ± = 1 + softplus(linear(features))
34
+ Matches the Zoobot approach for a fair direct comparison.
35
+ Mean vote fraction: E[p_q] = Ξ±_q / sum(Ξ±_q).
36
+
37
+ References
38
+ ----------
39
+ Gal & Ghahramani (2016). Dropout as a Bayesian Approximation.
40
+ ICML 2016. https://arxiv.org/abs/1506.02142
41
+ Walmsley et al. (2022). Towards Galaxy Foundation Models.
42
+ MNRAS 509, 3966. https://arxiv.org/abs/2110.12735
43
+ """
44
+
45
+ from __future__ import annotations
46
+
47
+ import torch
48
+ import torch.nn as nn
49
+ import torch.nn.functional as F
50
+ import timm
51
+ import numpy as np
52
+ from omegaconf import DictConfig
53
+ from typing import Optional, List, Tuple
54
+
55
+ from src.dataset import QUESTION_GROUPS
56
+
57
+
58
+ # ─────────────────────────────────────────────────────────────
59
+ # Attention hook manager
60
+ # ─────────────────────────────────────────────────────────────
61
+
62
class AttentionHookManager:
    """
    Captures post-softmax attention maps from every transformer block.

    Setting fused_attn=False makes timm's attention block materialise the
    softmax output and pass it through attn_drop:
        attn = softmax(q @ k.T / scale)   # [B, H, N+1, N+1]
        attn = attn_drop(attn)            # hook fires; input = post-softmax
        out  = attn @ v
    so a forward hook on attn_drop sees exactly the attention matrix needed
    for multi-layer rollout.
    """

    def __init__(self, blocks):
        self.blocks = blocks
        self._attn_list: List[torch.Tensor] = []
        self._handles = []
        self._register_hooks()

    def _register_hooks(self):
        # One shared closure suffices: the hook only appends to self's list.
        def capture(module, inputs, output):
            # inputs[0] is the post-softmax attention tensor [B, H, N+1, N+1].
            self._attn_list.append(inputs[0].detach())

        for blk in self.blocks:
            # Disable fused attention so attn_drop actually receives the
            # attention matrix instead of being skipped.
            blk.attn.fused_attn = False
            self._handles.append(
                blk.attn.attn_drop.register_forward_hook(capture)
            )

    def clear(self):
        """Drop attentions captured by the previous forward pass."""
        self._attn_list.clear()

    def get_all_attentions(self) -> Optional[List[torch.Tensor]]:
        """List of L tensors [B, H, N+1, N+1], or None if nothing captured."""
        return list(self._attn_list) if self._attn_list else None

    def get_last_attention(self) -> Optional[torch.Tensor]:
        """Deepest captured attention map, or None if nothing captured."""
        return self._attn_list[-1] if self._attn_list else None

    def remove_all(self):
        """Detach every registered hook and forget the handles."""
        for handle in self._handles:
            handle.remove()
        self._handles.clear()
110
+
111
+
112
+ # ─────────────────────────────────────────────────────────────
113
+ # GalaxyViT β€” proposed model
114
+ # ─────────────────────────────────────────────────────────────
115
+
116
class GalaxyViT(nn.Module):
    """
    ViT-Base/16 backbone + linear regression head for GZ2.

    Outputs 37 raw logits; softmax is applied per question group during
    loss computation and metric evaluation. Full multi-layer attention
    hooks are registered at construction for rollout visualisation.
    """

    def __init__(self, cfg: "DictConfig"):
        super().__init__()

        # num_classes=0 strips timm's classifier; backbone returns features.
        self.backbone = timm.create_model(
            cfg.model.backbone,
            pretrained=cfg.model.pretrained,
            num_classes=0,
        )
        embed_dim = self.backbone.embed_dim  # 768 for ViT-Base

        self.head = nn.Sequential(
            nn.Dropout(p=cfg.model.dropout),
            nn.Linear(embed_dim, 37),
        )

        self._hook_mgr = AttentionHookManager(self.backbone.blocks)
        self._mc_dropout = False

    def enable_mc_dropout(self):
        """Keep Dropout active at inference time for MC sampling."""
        self._mc_dropout = True
        for m in self.modules():
            if isinstance(m, nn.Dropout):
                m.train()

    def disable_mc_dropout(self):
        """Turn MC sampling off AND restore Dropout layers to eval mode.

        Bug fix: previously only the flag was cleared, leaving every
        nn.Dropout in train mode after enable_mc_dropout(), so subsequent
        'deterministic' inference was still stochastic.
        """
        self._mc_dropout = False
        for m in self.modules():
            if isinstance(m, nn.Dropout):
                m.eval()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """x: [B, 3, 224, 224] -> logits: [B, 37] (raw, pre-softmax)."""
        self._hook_mgr.clear()  # drop attentions from the previous pass
        features = self.backbone(x)   # [B, 768] CLS features
        logits = self.head(features)  # [B, 37]
        return logits

    def get_attention_weights(self) -> Optional[torch.Tensor]:
        """Last block's post-softmax attention, or None before any forward."""
        return self._hook_mgr.get_last_attention()

    def get_all_attention_weights(self) -> Optional[List[torch.Tensor]]:
        """All captured per-layer attentions for full rollout, or None."""
        return self._hook_mgr.get_all_attentions()

    def remove_hooks(self):
        """Detach all forward hooks (e.g. before export/serialisation)."""
        self._hook_mgr.remove_all()
168
+
169
+
170
+ # ─────────────────────────────────────────────────────────────
171
+ # GalaxyViTDirichlet β€” Zoobot-style comparison baseline
172
+ # ─────────────────────────────────────────────────────────────
173
+
174
class GalaxyViTDirichlet(nn.Module):
    """
    ViT-Base/16 + Dirichlet concentration head (Zoobot-style baseline).

    The head emits alpha = 1 + softplus(linear(features)) > 1 per answer,
    which keeps every Dirichlet unimodal. The posterior mean vote fraction
    is alpha_q / sum(alpha_q); the total concentration per question encodes
    the model's confidence.
    """

    def __init__(self, cfg: "DictConfig"):
        super().__init__()

        # num_classes=0 strips timm's classifier; backbone returns features.
        self.backbone = timm.create_model(
            cfg.model.backbone,
            pretrained=cfg.model.pretrained,
            num_classes=0,
        )

        self.head = nn.Sequential(
            nn.Dropout(p=cfg.model.dropout),
            nn.Linear(self.backbone.embed_dim, 37),
        )

        self._hook_mgr = AttentionHookManager(self.backbone.blocks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return alpha: [B, 37] Dirichlet concentration parameters > 1."""
        self._hook_mgr.clear()
        feats = self.backbone(x)
        return 1.0 + F.softplus(self.head(feats))  # alpha > 1

    def get_mean_prediction(self, alpha: torch.Tensor) -> torch.Tensor:
        """Dirichlet mean per question: alpha_q / sum(alpha_q)."""
        out = torch.zeros_like(alpha)
        for _, (lo, hi) in QUESTION_GROUPS.items():
            grp = alpha[:, lo:hi]
            out[:, lo:hi] = grp / grp.sum(dim=-1, keepdim=True)
        return out

    def get_attention_weights(self):
        """Last block's captured attention map, or None."""
        return self._hook_mgr.get_last_attention()

    def get_all_attention_weights(self):
        """All captured per-layer attention maps, or None."""
        return self._hook_mgr.get_all_attentions()
222
+
223
+
224
+ # ─────────────────────────────────────────────────────────────
225
+ # MC Dropout inference
226
+ # ─────────────────────────────────────────────────────────────
227
+
228
@torch.no_grad()
def mc_dropout_predict(
    model: "GalaxyViT",
    images: torch.Tensor,
    n_passes: int = 30,
    device: torch.device = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    MC Dropout epistemic uncertainty estimation (Gal & Ghahramani 2016).

    Runs n_passes stochastic forward passes with dropout active and
    aggregates per-question softmax predictions.

    Parameters
    ----------
    model    : GalaxyViT instance
    images   : [B, 3, H, W]
    n_passes : number of MC samples (30 is standard)
    device   : inference device (defaults to the model's device)

    Returns
    -------
    mean_pred        : [B, 37] mean softmax predictions
    std_pred         : [B, 37] std across passes (epistemic uncertainty)
    per_q_uncertainty: [B, 11] mean std per question

    Note: the model is restored to fully deterministic eval mode on return.
    """
    if device is None:
        device = next(model.parameters()).device

    model.eval()
    model.enable_mc_dropout()  # re-activate only the Dropout layers
    images = images.to(device)
    all_preds = []

    for _ in range(n_passes):
        logits = model(images)  # [B, 37]
        preds = torch.zeros_like(logits)
        # Normalise each question group independently.
        for q_name, (start, end) in QUESTION_GROUPS.items():
            preds[:, start:end] = F.softmax(logits[:, start:end], dim=-1)
        all_preds.append(preds.cpu().numpy())

    model.disable_mc_dropout()
    # Bug fix: explicitly restore eval mode so the Dropout layers switched
    # on by enable_mc_dropout() are deterministic for any later inference,
    # even if disable_mc_dropout() only clears the internal flag.
    model.eval()

    all_preds = np.stack(all_preds, axis=0)  # [n_passes, B, 37]
    mean_pred = all_preds.mean(axis=0)       # [B, 37]
    std_pred = all_preds.std(axis=0)         # [B, 37]

    q_names = list(QUESTION_GROUPS.keys())
    per_q_unc = np.zeros(
        (mean_pred.shape[0], len(q_names)), dtype=np.float32
    )
    # Collapse per-answer stds to one uncertainty value per question.
    for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
        per_q_unc[:, q_idx] = std_pred[:, start:end].mean(axis=1)

    return (
        mean_pred.astype(np.float32),
        std_pred.astype(np.float32),
        per_q_unc,
    )
287
+
288
+
289
+ # ─────────────────────────────────────────────────────────────
290
+ # Factory functions
291
+ # ─────────────────────────────────────────────────────────────
292
+
293
def build_model(cfg: "DictConfig") -> "GalaxyViT":
    """Construct the proposed regression model and print its summary."""
    net = GalaxyViT(cfg)
    _print_summary(net, cfg, "GalaxyViT (regression β€” proposed)")
    return net
297
+
298
+
299
def build_dirichlet_model(cfg: "DictConfig") -> "GalaxyViTDirichlet":
    """Construct the Dirichlet comparison baseline and print its summary."""
    net = GalaxyViTDirichlet(cfg)
    _print_summary(net, cfg, "GalaxyViTDirichlet (Zoobot-style baseline)")
    return net
303
+
304
+
305
+ def _print_summary(model, cfg, name: str):
306
+ total = sum(p.numel() for p in model.parameters())
307
+ trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
308
+ n_hooks = len(model.backbone.blocks)
309
+ print(f"\n{'='*55}")
310
+ print(f"Model : {name}")
311
+ print(f"Backbone : {cfg.model.backbone}")
312
+ print(f"Pretrained : {cfg.model.pretrained}")
313
+ print(f"Dropout : {cfg.model.dropout}")
314
+ print(f"Parameters : {total:,} ({trainable:,} trainable)")
315
+ print(f"Attn hooks : {n_hooks} layers (full rollout enabled)")
316
+ print(f"{'='*55}\n")
src/train.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/train.py
3
+ ------------
4
+ Main training loop for the proposed hierarchical probabilistic ViT
5
+ regression model on Galaxy Zoo 2.
6
+
7
+ Model : GalaxyViT (ViT-Base/16 + linear head)
8
+ Loss : HierarchicalLoss (KL + MSE, Ξ»=0.5 each)
9
+ Scheduler: CosineAnnealingLR
10
+ Dropout : 0.3 (increased from 0.1 β€” see base.yaml rationale)
11
+
12
+ Saves
13
+ -----
14
+ outputs/checkpoints/best_<experiment_name>.pt β€” best checkpoint
15
+ outputs/logs/training_<experiment_name>_history.csv β€” epoch history
16
+
17
+ Usage
18
+ -----
19
+ cd ~/galaxy
20
+ nohup python -m src.train --config configs/full_train.yaml \
21
+ > outputs/logs/train_full.log 2>&1 &
22
+ echo "PID: $!"
23
+ """
24
+
25
+ import argparse
26
+ import logging
27
+ import random
28
+ import sys
29
+ from pathlib import Path
30
+
31
+ import numpy as np
32
+ import torch
33
+ import torch.nn as nn
34
+ from torch.amp import autocast, GradScaler
35
+ from omegaconf import OmegaConf
36
+ import pandas as pd
37
+
38
+ import wandb
39
+ from tqdm import tqdm
40
+
41
+ from src.dataset import build_dataloaders
42
+ from src.loss import HierarchicalLoss
43
+ from src.metrics import compute_metrics, predictions_to_numpy
44
+ from src.model import build_model
45
+ from src.attention_viz import plot_attention_grid
46
+
47
+ logging.basicConfig(
48
+ format="%(asctime)s %(levelname)s %(name)s %(message)s",
49
+ datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout,
50
+ )
51
+ log = logging.getLogger("train")
52
+
53
+
54
+ # ─────────────────────────────────────────────────────────────
55
+ # Utilities
56
+ # ─────────────────────────────────────────────────────────────
57
+
58
def set_seed(seed: int):
    """Seed python, numpy and torch RNGs and force deterministic cuDNN."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    # Trade cuDNN autotuning speed for reproducible kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
65
+
66
+
67
class EarlyStopping:
    """Track best validation loss, checkpoint on improvement, and signal
    a stop once `patience` consecutive epochs fail to improve by more
    than `min_delta`."""

    def __init__(self, patience, min_delta, checkpoint_path):
        self.patience = patience
        self.min_delta = min_delta
        self.checkpoint_path = checkpoint_path
        self.best_loss = float("inf")
        self.counter = 0
        self.best_epoch = 0

    def step(self, val_loss, model, epoch) -> bool:
        """Record one epoch's result; return True when patience runs out."""
        improved = val_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss = val_loss
            self.best_epoch = epoch
            self.counter = 0
            payload = {
                "epoch": epoch,
                "model_state": model.state_dict(),
                "val_loss": val_loss,
            }
            torch.save(payload, self.checkpoint_path)
            log.info(" [ckpt] saved epoch=%d val_loss=%.6f", epoch, val_loss)
        else:
            self.counter += 1
            log.info(" [early_stop] %d/%d best=%.6f",
                     self.counter, self.patience, self.best_loss)
        return self.counter >= self.patience

    def restore_best(self, model):
        """Reload the best checkpoint's weights into `model` (in place)."""
        ckpt = torch.load(self.checkpoint_path, map_location="cpu",
                          weights_only=True)
        model.load_state_dict(ckpt["model_state"])
        log.info("Restored best weights epoch=%d val_loss=%.6f",
                 ckpt["epoch"], ckpt["val_loss"])
99
+
100
+
101
+ # ─────────────────────────────────────────────────────────────
102
+ # Training / validation steps
103
+ # ─────────────────────────────────────────────────────────────
104
+
105
def train_one_epoch(model, loader, loss_fn, optimizer,
                    scaler, device, cfg, epoch):
    """Run one training epoch and return the mean batch loss.

    The AMP sequence — scale -> backward -> unscale -> clip -> step ->
    update — is order-critical; do not rearrange these calls.
    """
    model.train()
    total = 0.0
    nb = 0
    for images, targets, weights, _ in tqdm(
        loader, desc=f"Train E{epoch}", leave=False
    ):
        # Async host->device copies (effective with pinned-memory loaders).
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        weights = weights.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)  # free grads instead of zeroing
        with autocast("cuda", enabled=cfg.training.mixed_precision):
            logits = model(images)
            # loss_fn returns (total_loss, breakdown); breakdown unused here.
            loss, _ = loss_fn(logits, targets, weights)
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold sees true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.training.grad_clip)
        scaler.step(optimizer)
        scaler.update()

        total += loss.item()
        nb += 1
    return total / nb
130
+
131
+
132
def validate(model, loader, loss_fn, device, cfg,
             collect_attn=False, n_attn=8, epoch=0):
    """Evaluate one epoch; return (val_logs dict, optional attention data).

    When `collect_attn` is True, up to `n_attn` samples' per-layer
    attention maps are gathered from the first batches for visualization.
    """
    model.eval()
    total = 0.0
    nb = 0
    all_preds, all_targets, all_weights = [], [], []
    attn_imgs, all_layers_list, attn_ids = [], [], []
    attn_done = False  # set once n_attn samples have been collected

    with torch.no_grad():
        for images, targets, weights, image_ids in tqdm(
            loader, desc=f"Val E{epoch}", leave=False
        ):
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            weights = weights.to(device, non_blocking=True)

            with autocast("cuda", enabled=cfg.training.mixed_precision):
                logits = model(images)
                loss, _ = loss_fn(logits, targets, weights)

            total += loss.item()
            nb += 1
            p, t, w = predictions_to_numpy(logits, targets, weights)
            all_preds.append(p)
            all_targets.append(t)
            all_weights.append(w)

            if collect_attn and not attn_done:
                # Per-layer attention maps captured by forward hooks.
                all_layers = model.get_all_attention_weights()
                if all_layers is not None:
                    n = min(n_attn, images.shape[0])
                    attn_imgs.append(images[:n].cpu())
                    all_layers_list.append([l[:n].cpu() for l in all_layers])
                    attn_ids.extend([int(i) for i in image_ids[:n]])
                    if len(attn_ids) >= n_attn:
                        attn_done = True

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    all_weights = np.concatenate(all_weights)
    metrics = compute_metrics(all_preds, all_targets, all_weights)

    val_logs = {"val/loss_total": total / nb}
    val_logs.update({f"val/{k}": v for k, v in metrics.items()})
    val_logs["val/reached_mae_w050"] = metrics.get("mae_w050/conditional_avg", 0)

    attn_data = None
    if collect_attn and attn_imgs:
        # Concatenate images, then per-layer maps across collected batches.
        attn_data = (
            torch.cat(attn_imgs, dim=0),
            [torch.cat([b[li] for b in all_layers_list], dim=0)
             for li in range(len(all_layers_list[0]))],
            attn_ids,
        )

    return val_logs, attn_data
189
+
190
+
191
+ # ─────────────────────────────────────────────────────────────
192
+ # Main
193
+ # ─────────────────────────────────────────────────────────────
194
+
195
def main():
    """CLI entry point: merge configs, train with early stopping, save
    history CSV and the best checkpoint, and optionally log to wandb."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True)
    args = parser.parse_args()

    # Experiment config overrides base config.
    base_cfg = OmegaConf.load("configs/base.yaml")
    exp_cfg = OmegaConf.load(args.config)
    cfg = OmegaConf.merge(base_cfg, exp_cfg)

    set_seed(cfg.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info("Device: %s", device)

    # TF32 speeds up matmuls/convs on Ampere+ with minimal precision loss.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    Path(cfg.outputs.checkpoint_dir).mkdir(parents=True, exist_ok=True)
    Path(cfg.outputs.figures_dir).mkdir(parents=True, exist_ok=True)
    Path(cfg.outputs.log_dir).mkdir(parents=True, exist_ok=True)

    checkpoint_path = str(
        Path(cfg.outputs.checkpoint_dir) / f"best_{cfg.experiment_name}.pt"
    )
    history_path = str(
        Path(cfg.outputs.log_dir) / f"training_{cfg.experiment_name}_history.csv"
    )

    if cfg.wandb.enabled:
        wandb.init(
            project=cfg.wandb.project,
            name=cfg.experiment_name,
            config=OmegaConf.to_container(cfg, resolve=True),
        )

    log.info("Building dataloaders...")
    train_loader, val_loader, _ = build_dataloaders(cfg)

    log.info("Building model...")
    model = build_model(cfg).to(device)
    loss_fn = HierarchicalLoss(cfg)

    # Discriminative LR: pretrained backbone gets 10x lower LR than head.
    optimizer = torch.optim.AdamW(
        [
            {"params": model.backbone.parameters(),
             "lr": cfg.training.learning_rate * 0.1},
            {"params": model.head.parameters(),
             "lr": cfg.training.learning_rate},
        ],
        weight_decay=cfg.training.weight_decay,
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.scheduler.T_max, eta_min=cfg.scheduler.eta_min
    )
    scaler = GradScaler("cuda")
    early_stop = EarlyStopping(
        patience = cfg.early_stopping.patience,
        min_delta = cfg.early_stopping.min_delta,
        checkpoint_path = checkpoint_path,
    )

    log.info("Starting training: %s", cfg.experiment_name)
    history = []

    for epoch in range(1, cfg.training.epochs + 1):
        train_loss = train_one_epoch(
            model, train_loader, loss_fn, optimizer, scaler, device, cfg, epoch
        )
        # Attention maps are collected only every N epochs (expensive).
        collect_attn = (epoch % cfg.wandb.log_attention_every_n_epochs == 0)
        val_logs, attn_data = validate(
            model, val_loader, loss_fn, device, cfg,
            collect_attn=collect_attn,
            n_attn=cfg.wandb.n_attention_samples,
            epoch=epoch,
        )
        scheduler.step()
        lr = scheduler.get_last_lr()[0]

        val_mae = val_logs.get("val/mae/weighted_avg", 0)
        val_loss = val_logs["val/loss_total"]
        reached = val_logs.get("val/reached_mae_w050", 0)

        log.info(
            "Epoch %d train=%.4f val=%.4f mae=%.4f reached_mae=%.4f lr=%.2e",
            epoch, train_loss, val_loss, val_mae, reached, lr,
        )

        history.append({
            "epoch"      : epoch,
            "train_loss" : train_loss,
            "val_loss"   : val_loss,
            "val_mae"    : val_mae,
            "reached_mae": reached,
            "lr"         : lr,
        })

        if cfg.wandb.enabled:
            log_dict = {
                "train/loss": train_loss,
                **val_logs,
                "lr": lr, "epoch": epoch,
            }
            if attn_data is not None:
                import matplotlib.pyplot as plt
                imgs, layers, ids = attn_data
                fig = plot_attention_grid(
                    imgs, layers, ids,
                    save_path=(
                        f"{cfg.outputs.figures_dir}/{cfg.experiment_name}/"
                        f"attn_epoch{epoch:03d}.png"
                    ),
                    n_cols=4, rollout_mode="full",
                )
                log_dict["attention/rollout_full"] = wandb.Image(fig)
                plt.close(fig)  # free figure memory each epoch
            wandb.log(log_dict, step=epoch)

        # step() checkpoints on improvement and returns True to stop.
        if early_stop.step(val_loss, model, epoch):
            log.info("Early stopping at epoch %d best=%d loss=%.6f",
                     epoch, early_stop.best_epoch, early_stop.best_loss)
            break

    # Save history
    pd.DataFrame(history).to_csv(history_path, index=False)
    log.info("Saved history: %s", history_path)

    early_stop.restore_best(model)
    if cfg.wandb.enabled:
        wandb.finish()
    log.info("Done. Best checkpoint: %s", checkpoint_path)
324
+
325
+
326
+ if __name__ == "__main__":
327
+ main()
src/train_single.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/train_single.py
3
+ -------------------
4
+ Train any single model by name. Designed for running baselines
5
+ one at a time with breaks between them.
6
+
7
+ Available models
8
+ ----------------
9
+ proposed β€” ViT-Base + hierarchical KL+MSE (main model)
10
+ b1_resnet_mse β€” ResNet-18 + independent MSE (sigmoid)
11
+ b2_resnet_kl β€” ResNet-18 + hierarchical KL+MSE
12
+ b3_vit_mse β€” ViT-Base + hierarchical MSE only (no KL)
13
+ b4_vit_dir β€” ViT-Base + Dirichlet NLL (Zoobot-style)
14
+
15
+ Usage
16
+ -----
17
+ # Train proposed model
18
+ python -m src.train_single --model proposed --config configs/full_train.yaml
19
+
20
+ # Train one baseline at a time
21
+ python -m src.train_single --model b1_resnet_mse --config configs/full_train.yaml
22
+ python -m src.train_single --model b2_resnet_kl --config configs/full_train.yaml
23
+ python -m src.train_single --model b3_vit_mse --config configs/full_train.yaml
24
+ python -m src.train_single --model b4_vit_dir --config configs/full_train.yaml
25
+
26
+ # With nohup (recommended)
27
+ nohup python -m src.train_single --model b3_vit_mse \\
28
+ --config configs/full_train.yaml \\
29
+ > outputs/logs/train_b3_vit_mse.log 2>&1 &
30
+ echo "PID: $!"
31
+
32
+ Each model saves its checkpoint independently, so you can run them
33
+ in any order and resume from any point. Already-trained models are
34
+ detected by their checkpoint file and skipped unless --force is passed.
35
+ """
36
+
37
+ import argparse
38
+ import logging
39
+ import sys
40
+ from pathlib import Path
41
+
42
+ import numpy as np
43
+ import torch
44
+ from omegaconf import OmegaConf
45
+
46
+ logging.basicConfig(
47
+ format="%(asctime)s %(levelname)s %(message)s",
48
+ datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout,
49
+ )
50
+ log = logging.getLogger("train_single")
51
+
52
+ # ── Checkpoint paths per model ─────────────────────────────────────────────────
53
+ CHECKPOINT_NAMES = {
54
+ "proposed" : "best_full_train.pt",
55
+ "b1_resnet_mse" : "baseline_resnet18_mse.pt",
56
+ "b2_resnet_kl" : "baseline_resnet18_klmse.pt",
57
+ "b3_vit_mse" : "baseline_vit_mse.pt",
58
+ "b4_vit_dir" : "baseline_vit_dirichlet.pt",
59
+ }
60
+
61
+ # ── Human-readable labels ──────────────────────────────────────────────────────
62
+ MODEL_LABELS = {
63
+ "proposed" : "ViT-Base + hierarchical KL+MSE (proposed)",
64
+ "b1_resnet_mse" : "ResNet-18 + independent MSE (sigmoid, no hierarchy)",
65
+ "b2_resnet_kl" : "ResNet-18 + hierarchical KL+MSE",
66
+ "b3_vit_mse" : "ViT-Base + hierarchical MSE only (no KL)",
67
+ "b4_vit_dir" : "ViT-Base + Dirichlet NLL (Zoobot-style)",
68
+ }
69
+
70
+
71
def train_proposed(cfg, device, ckpt_path):
    """Train the proposed ViT + hierarchical KL+MSE model.

    Mirrors src/train.py's main() but saves to the given `ckpt_path` and
    writes history to a fixed 'full_train' CSV. Imports are deferred so
    other model choices don't pay for these dependencies.
    """
    from src.train import (
        train_one_epoch, validate, EarlyStopping, set_seed
    )
    from src.dataset import build_dataloaders
    from src.model import build_model
    from src.loss import HierarchicalLoss
    from src.attention_viz import plot_attention_grid
    import pandas as pd
    import wandb
    from torch.amp import GradScaler
    import matplotlib.pyplot as plt

    set_seed(cfg.seed)
    log.info("Training: %s", MODEL_LABELS["proposed"])

    Path(cfg.outputs.checkpoint_dir).mkdir(parents=True, exist_ok=True)
    Path(cfg.outputs.figures_dir).mkdir(parents=True, exist_ok=True)
    Path(cfg.outputs.log_dir).mkdir(parents=True, exist_ok=True)

    history_path = str(
        Path(cfg.outputs.log_dir) / "training_full_train_history.csv"
    )

    if cfg.wandb.enabled:
        wandb.init(
            project=cfg.wandb.project,
            name=cfg.experiment_name,
            config=OmegaConf.to_container(cfg, resolve=True),
        )

    train_loader, val_loader, _ = build_dataloaders(cfg)
    model = build_model(cfg).to(device)
    loss_fn = HierarchicalLoss(cfg)

    # Discriminative LR: backbone 10x lower than the regression head.
    optimizer = torch.optim.AdamW(
        [
            {"params": model.backbone.parameters(),
             "lr": cfg.training.learning_rate * 0.1},
            {"params": model.head.parameters(),
             "lr": cfg.training.learning_rate},
        ],
        weight_decay=cfg.training.weight_decay,
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.scheduler.T_max, eta_min=cfg.scheduler.eta_min
    )
    scaler = GradScaler("cuda")
    early_stop = EarlyStopping(
        patience=cfg.early_stopping.patience,
        min_delta=cfg.early_stopping.min_delta,
        checkpoint_path=ckpt_path,
    )

    history = []
    for epoch in range(1, cfg.training.epochs + 1):
        train_loss = train_one_epoch(
            model, train_loader, loss_fn, optimizer, scaler, device, cfg, epoch
        )
        # Collect attention maps only every N epochs (expensive).
        collect_attn = (epoch % cfg.wandb.log_attention_every_n_epochs == 0)
        val_logs, attn_data = validate(
            model, val_loader, loss_fn, device, cfg,
            collect_attn=collect_attn,
            n_attn=cfg.wandb.n_attention_samples,
            epoch=epoch,
        )
        scheduler.step()
        lr = scheduler.get_last_lr()[0]

        val_mae = val_logs.get("val/mae/weighted_avg", 0)
        val_loss = val_logs["val/loss_total"]
        log.info("Epoch %d train=%.4f val=%.4f mae=%.4f lr=%.2e",
                 epoch, train_loss, val_loss, val_mae, lr)

        history.append({
            "epoch": epoch, "train_loss": train_loss,
            "val_loss": val_loss, "val_mae": val_mae, "lr": lr,
        })

        if cfg.wandb.enabled:
            log_dict = {"train/loss": train_loss, **val_logs,
                        "lr": lr, "epoch": epoch}
            if attn_data is not None:
                imgs, layers, ids = attn_data
                fig = plot_attention_grid(
                    imgs, layers, ids,
                    save_path=(f"{cfg.outputs.figures_dir}/{cfg.experiment_name}/"
                               f"attn_epoch{epoch:03d}.png"),
                    n_cols=4, rollout_mode="full",
                )
                log_dict["attention/rollout_full"] = wandb.Image(fig)
                plt.close(fig)  # free the figure each epoch
            wandb.log(log_dict, step=epoch)

        # step() checkpoints on improvement and returns True to stop.
        if early_stop.step(val_loss, model, epoch):
            log.info("Early stopping at epoch %d", epoch)
            break

    pd.DataFrame(history).to_csv(history_path, index=False)
    early_stop.restore_best(model)
    if cfg.wandb.enabled:
        wandb.finish()
    log.info("Done. Checkpoint: %s", ckpt_path)
175
+
176
+
177
def train_baseline(cfg, device, ckpt_path, model_key):
    """Train any of the four baselines.

    Returns
    -------
    (test_metrics, best_val, best_epoch)

    NOTE(review): `best_val` is whatever this EarlyStopping's
    restore_best() returns — that class comes from src.baselines (not
    visible here); confirm it actually returns the best val loss.
    """
    import wandb
    from torch.amp import GradScaler
    from src.dataset import build_dataloaders
    from src.model import build_model, build_dirichlet_model
    from src.loss import HierarchicalLoss, DirichletLoss, MSEOnlyLoss
    from src.metrics import (compute_metrics, predictions_to_numpy,
                             dirichlet_predictions_to_numpy)
    from src.baselines import (
        ResNet18Baseline, IndependentMSELoss, EarlyStopping,
        set_seed, _train_epoch, _val_epoch,
        _train_epoch_dirichlet, _val_epoch_dirichlet,
    )
    import pandas as pd
    from omegaconf import OmegaConf as OC

    set_seed(cfg.seed)
    log.info("Training: %s", MODEL_LABELS[model_key])

    Path(cfg.outputs.checkpoint_dir).mkdir(parents=True, exist_ok=True)

    # -- Build model and loss per baseline key ------------------
    if model_key == "b1_resnet_mse":
        model = ResNet18Baseline(dropout=cfg.model.dropout).to(device)
        loss_fn = IndependentMSELoss()
        use_sigmoid = True
        is_dirichlet = False
        use_layerwise_lr = False
        wandb_name = "B1-ResNet18-MSE"

    elif model_key == "b2_resnet_kl":
        model = ResNet18Baseline(dropout=cfg.model.dropout).to(device)
        loss_fn = HierarchicalLoss(cfg)
        use_sigmoid = False
        is_dirichlet = False
        use_layerwise_lr = False
        wandb_name = "B2-ResNet18-KL+MSE"

    elif model_key == "b3_vit_mse":
        # Ablation: disable the KL term, keep the MSE term.
        vit_mse_cfg = OC.merge(
            cfg, OC.create({"loss": {"lambda_kl": 0.0, "lambda_mse": 1.0}})
        )
        model = build_model(vit_mse_cfg).to(device)
        loss_fn = MSEOnlyLoss(vit_mse_cfg)
        cfg = vit_mse_cfg  # use updated cfg for optimizer
        use_sigmoid = False
        is_dirichlet = False
        use_layerwise_lr = True
        wandb_name = "B3-ViT-MSE"

    elif model_key == "b4_vit_dir":
        model = build_dirichlet_model(cfg).to(device)
        loss_fn = DirichletLoss(cfg)
        use_sigmoid = False
        is_dirichlet = True
        use_layerwise_lr = True
        wandb_name = "B4-ViT-Dirichlet"

    else:
        raise ValueError(f"Unknown model key: {model_key}")

    total = sum(p.numel() for p in model.parameters())
    log.info("Parameters: %s", f"{total:,}")

    # -- Optimizer: ViT models get a discriminative (layerwise) LR --
    if use_layerwise_lr and hasattr(model, "backbone") and hasattr(model, "head"):
        optimizer = torch.optim.AdamW(
            [
                {"params": model.backbone.parameters(),
                 "lr": cfg.training.learning_rate * 0.1},
                {"params": model.head.parameters(),
                 "lr": cfg.training.learning_rate},
            ],
            weight_decay=cfg.training.weight_decay,
        )
    else:
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=cfg.training.learning_rate,
            weight_decay=cfg.training.weight_decay,
        )

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.scheduler.T_max, eta_min=cfg.scheduler.eta_min
    )
    scaler = GradScaler("cuda")
    early_stop = EarlyStopping(
        patience=cfg.early_stopping.patience,
        min_delta=cfg.early_stopping.min_delta,
        checkpoint_path=ckpt_path,
    )

    train_loader, val_loader, test_loader = build_dataloaders(cfg)

    wandb.init(
        project=cfg.wandb.project, name=wandb_name,
        config={"model": wandb_name, "seed": cfg.seed,
                "epochs": cfg.training.epochs,
                "lambda_kl": cfg.loss.lambda_kl},
        reinit=True,  # allow multiple runs in one process
    )

    # -- Training loop: Dirichlet baselines use their own epoch fns --
    history = []
    for epoch in range(1, cfg.training.epochs + 1):
        if is_dirichlet:
            train_loss = _train_epoch_dirichlet(
                model, train_loader, loss_fn, optimizer, scaler,
                device, cfg, epoch, wandb_name
            )
            val_loss, val_metrics = _val_epoch_dirichlet(
                model, val_loader, loss_fn, device, cfg, epoch, wandb_name
            )
        else:
            train_loss = _train_epoch(
                model, train_loader, loss_fn, optimizer, scaler,
                device, cfg, epoch, wandb_name
            )
            val_loss, val_metrics = _val_epoch(
                model, val_loader, loss_fn, device, cfg, epoch, wandb_name,
                use_sigmoid=use_sigmoid
            )

        scheduler.step()
        lr = scheduler.get_last_lr()[0]
        val_mae = val_metrics.get("mae/weighted_avg", 0)

        log.info("%s epoch=%d train=%.4f val=%.4f mae=%.4f lr=%.2e",
                 wandb_name, epoch, train_loss, val_loss, val_mae, lr)
        history.append({
            "epoch": epoch, "train_loss": train_loss,
            "val_loss": val_loss, "val_mae": val_mae,
        })
        wandb.log({
            "train_loss": train_loss, "val_loss": val_loss,
            "val_mae": val_mae, "lr": lr,
        }, step=epoch)

        if early_stop.step(val_loss, model, epoch):
            log.info("%s: early stopping at epoch %d", wandb_name, epoch)
            break

    best_val = early_stop.restore_best(model)
    wandb.finish()

    # -- Test evaluation with restored best weights -------------
    log.info("Evaluating on test set...")
    if is_dirichlet:
        _, test_metrics = _val_epoch_dirichlet(
            model, test_loader, loss_fn, device, cfg,
            epoch=0, label=f"{wandb_name}-test"
        )
    else:
        _, test_metrics = _val_epoch(
            model, test_loader, loss_fn, device, cfg,
            epoch=0, label=f"{wandb_name}-test", use_sigmoid=use_sigmoid
        )

    log.info("%s β€” Test MAE=%.5f RMSE=%.5f",
             wandb_name,
             test_metrics["mae/weighted_avg"],
             test_metrics["rmse/weighted_avg"])

    # -- Save per-model history ---------------------------------
    hist_path = Path(cfg.outputs.log_dir) / f"training_{model_key}_history.csv"
    pd.DataFrame(history).to_csv(hist_path, index=False)
    log.info("History saved: %s", hist_path)
    log.info("Done. Checkpoint: %s", ckpt_path)

    return test_metrics, best_val, early_stop.best_epoch
348
+
349
+
350
+ # ─────────────────────────────────────────────────────────────
351
+ # Main
352
+ # ─────────────────────────────────────────────────────────────
353
+
354
def main():
    """CLI entry point: pick one model by key, skip if its checkpoint
    already exists (unless --force), then dispatch to the trainer."""
    parser = argparse.ArgumentParser(
        description="Train a single model. Run multiple times to train "
                    "different models with breaks in between."
    )
    parser.add_argument(
        "--model",
        required=True,
        choices=list(CHECKPOINT_NAMES.keys()),
        help=(
            "Which model to train:\n"
            "  proposed       β€” ViT-Base + hierarchical KL+MSE (main)\n"
            "  b1_resnet_mse  β€” ResNet-18 + independent MSE (sigmoid)\n"
            "  b2_resnet_kl   β€” ResNet-18 + hierarchical KL+MSE\n"
            "  b3_vit_mse     β€” ViT-Base + hierarchical MSE only\n"
            "  b4_vit_dir     β€” ViT-Base + Dirichlet NLL\n"
        ),
    )
    parser.add_argument("--config", required=True)
    parser.add_argument(
        "--force",
        action="store_true",
        help="Retrain even if checkpoint already exists.",
    )
    args = parser.parse_args()

    # Experiment config overrides base config.
    base_cfg = OmegaConf.load("configs/base.yaml")
    exp_cfg = OmegaConf.load(args.config)
    cfg = OmegaConf.merge(base_cfg, exp_cfg)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ckpt_dir = Path(cfg.outputs.checkpoint_dir)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    Path(cfg.outputs.log_dir).mkdir(parents=True, exist_ok=True)

    # TF32 speeds up matmuls/convs on Ampere+ GPUs.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    ckpt_path = str(ckpt_dir / CHECKPOINT_NAMES[args.model])

    # -- Skip if already done: checkpoint file marks completion --
    if Path(ckpt_path).exists() and not args.force:
        log.info("Checkpoint already exists: %s", ckpt_path)
        log.info("Model '%s' is already trained. Skipping.", args.model)
        log.info("Use --force to retrain.")
        return

    log.info("=" * 60)
    log.info("Training: %s", MODEL_LABELS[args.model])
    log.info("Device  : %s", device)
    log.info("Config  : %s", args.config)
    log.info("Ckpt    : %s", ckpt_path)
    log.info("=" * 60)

    # Dispatch: the proposed model has its own trainer; all baselines
    # share train_baseline().
    if args.model == "proposed":
        train_proposed(cfg, device, ckpt_path)
    else:
        train_baseline(cfg, device, ckpt_path, args.model)

    log.info("=" * 60)
    log.info("FINISHED: %s", MODEL_LABELS[args.model])
    log.info("=" * 60)
417
+
418
+ if __name__ == "__main__":
419
+ main()
src/uncertainty_analysis.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/uncertainty_analysis.py
3
+ ----------------------------
4
+ MC Dropout epistemic uncertainty analysis for the proposed model.
5
+
6
+ MC Dropout (Gal & Ghahramani 2016) is used as a post-hoc uncertainty
7
+ estimator. At inference time, dropout is kept active and N=30 stochastic
8
+ forward passes are run per batch. The standard deviation across passes
9
+ is used as the epistemic uncertainty estimate per galaxy per question.
10
+
11
+ Key findings reported
12
+ ---------------------
13
+ 1. Uncertainty distributions: right-skewed, well-separated means across
14
+ questions reflecting the conditional nature of the decision tree.
15
+
16
+ 2. Uncertainty vs. error correlation: Spearman ρ reported per question.
17
+ Strong positive correlation for root and shallow-branch questions
18
+ (t01, t02, t04, t07) indicates the model is well-calibrated in
19
+ uncertainty. Weak or near-zero correlation for deep conditional
20
+ branches (t03, t05, t08, t09, t10, t11) is expected β€” these branches
21
+ have small effective sample sizes and aleatoric uncertainty dominates.
22
+
23
+ 3. Morphology selection benchmark: F1 score at threshold Ο„ for downstream
24
+ binary morphology classification tasks.
25
+
26
+ Output files
27
+ ------------
28
+ outputs/figures/uncertainty/
29
+ fig_uncertainty_distributions.pdf
30
+ fig_uncertainty_vs_error.pdf
31
+ fig_morphology_f1_comparison.pdf
32
+ table_uncertainty_summary.csv
33
+ table_morphology_selection_benchmark.csv
34
+ mc_cache/ β€” cached numpy arrays (crash-safe)
35
+
36
+ Usage
37
+ -----
38
+ cd ~/galaxy
39
+ nohup python -m src.uncertainty_analysis \
40
+ --config configs/full_train.yaml --n_passes 30 \
41
+ > outputs/logs/uncertainty.log 2>&1 &
42
+ echo "PID: $!"
43
+ """
44
+
45
+ import argparse
46
+ import logging
47
+ import sys
48
+ from pathlib import Path
49
+
50
+ import numpy as np
51
+ import pandas as pd
52
+ import torch
53
+ import torch.nn.functional as F
54
+ import matplotlib
55
+ matplotlib.use("Agg")
56
+ import matplotlib.pyplot as plt
57
+ from scipy import stats as scipy_stats
58
+ from torch.amp import autocast
59
+ from omegaconf import OmegaConf
60
+ from tqdm import tqdm
61
+
62
+ from src.dataset import build_dataloaders, QUESTION_GROUPS
63
+ from src.model import build_model, build_dirichlet_model
64
+ from src.baselines import ResNet18Baseline
65
+ from src.metrics import predictions_to_numpy, dirichlet_predictions_to_numpy
66
+
67
+ logging.basicConfig(
68
+ format="%(asctime)s %(levelname)s %(message)s",
69
+ datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout,
70
+ )
71
+ log = logging.getLogger("uncertainty")
72
+
73
+ plt.rcParams.update({
74
+ "figure.dpi": 150, "savefig.dpi": 300,
75
+ "font.family": "serif", "font.size": 11,
76
+ "axes.titlesize": 10, "axes.labelsize": 10,
77
+ "xtick.labelsize": 8, "ytick.labelsize": 8,
78
+ "legend.fontsize": 8,
79
+ "figure.facecolor": "white", "axes.facecolor": "white",
80
+ "axes.grid": True, "grid.alpha": 0.3,
81
+ "pdf.fonttype": 42, "ps.fonttype": 42,
82
+ })
83
+
84
+ QUESTION_LABELS = {
85
+ "t01": "Smooth or features", "t02": "Edge-on disk",
86
+ "t03": "Bar", "t04": "Spiral arms",
87
+ "t05": "Bulge prominence", "t06": "Odd feature",
88
+ "t07": "Roundedness", "t08": "Odd feature type",
89
+ "t09": "Bulge shape", "t10": "Arms winding",
90
+ "t11": "Arms number",
91
+ }
92
+
93
+ MODEL_COLORS = {
94
+ "ViT-Base + KL+MSE (proposed)" : "#27ae60",
95
+ "ViT-Base + Dirichlet (Zoobot-style)": "#8e44ad",
96
+ "ResNet-18 + MSE (sigmoid)" : "#c0392b",
97
+ "ResNet-18 + KL+MSE" : "#e67e22",
98
+ }
99
+
100
+ SELECTION_THRESHOLDS = [0.5, 0.7, 0.8, 0.9]
101
+ SELECTION_ANSWERS = {
102
+ "t01": (0, "smooth"),
103
+ "t02": (0, "edge-on"),
104
+ "t03": (0, "bar"),
105
+ "t04": (0, "spiral"),
106
+ "t06": (0, "odd feature"),
107
+ }
108
+
109
+
110
+ # ─────────────────────────────────────────────────────────────
111
+ # MC Dropout inference β€” Welford online algorithm, crash-safe
112
+ # ─────────────────────────────────────────────────────────────
113
+
114
+ def run_mc_inference(model, loader, device, cfg,
115
+ n_passes=30, cache_dir=None):
116
+ """
117
+ Fast batched MC Dropout inference.
118
+
119
+ Uses Welford's online algorithm to compute mean and std
120
+ per batch without storing all n_passes Γ— N predictions.
121
+ Memory usage: O(N Γ— 37) regardless of n_passes.
122
+
123
+ Parameters
124
+ ----------
125
+ model : GalaxyViT with enable_mc_dropout() available
126
+ loader : test DataLoader
127
+ device : inference device
128
+ cfg : OmegaConf config
129
+ n_passes : number of stochastic forward passes (default 30)
130
+ cache_dir : if given, saves .npy files and skips if they exist
131
+
132
+ Returns
133
+ -------
134
+ mean_all, std_all : [N, 37] float32
135
+ targets_all : [N, 37] float32
136
+ weights_all : [N, 11] float32
137
+ """
138
+ if cache_dir is not None:
139
+ cache_dir = Path(cache_dir)
140
+ cache_dir.mkdir(parents=True, exist_ok=True)
141
+ fp_mean = cache_dir / "mc_mean.npy"
142
+ fp_std = cache_dir / "mc_std.npy"
143
+ fp_targets = cache_dir / "mc_targets.npy"
144
+ fp_weights = cache_dir / "mc_weights.npy"
145
+
146
+ if all(p.exists() for p in [fp_mean, fp_std, fp_targets, fp_weights]):
147
+ log.info("MC cache found β€” loading from disk (skipping inference).")
148
+ return (np.load(fp_mean), np.load(fp_std),
149
+ np.load(fp_targets), np.load(fp_weights))
150
+
151
+ model.eval()
152
+ model.enable_mc_dropout()
153
+
154
+ all_means, all_stds, all_targets, all_weights = [], [], [], []
155
+ log.info("MC Dropout: %d passes Γ— %d-image batches = %d total forward passes",
156
+ n_passes, loader.batch_size, n_passes * len(loader))
157
+
158
+ for images, targets, weights, _ in tqdm(loader, desc="MC Dropout"):
159
+ images_dev = images.to(device, non_blocking=True)
160
+
161
+ # Welford online mean and M2
162
+ mean_acc = None
163
+ M2_acc = None
164
+ count = 0
165
+
166
+ for _ in range(n_passes):
167
+ with torch.no_grad():
168
+ with autocast("cuda", enabled=cfg.training.mixed_precision):
169
+ logits = model(images_dev)
170
+
171
+ pred = torch.zeros_like(logits)
172
+ for q, (s, e) in QUESTION_GROUPS.items():
173
+ pred[:, s:e] = F.softmax(logits[:, s:e], dim=-1)
174
+ pred_np = pred.cpu().float().numpy() # [B, 37]
175
+
176
+ count += 1
177
+ if mean_acc is None:
178
+ mean_acc = pred_np.copy()
179
+ M2_acc = np.zeros_like(pred_np)
180
+ else:
181
+ delta = pred_np - mean_acc
182
+ mean_acc += delta / count
183
+ M2_acc += delta * (pred_np - mean_acc)
184
+
185
+ std_acc = np.sqrt(M2_acc / (count - 1) if count > 1
186
+ else np.zeros_like(M2_acc))
187
+
188
+ all_means.append(mean_acc)
189
+ all_stds.append(std_acc)
190
+ all_targets.append(targets.numpy())
191
+ all_weights.append(weights.numpy())
192
+
193
+ model.disable_mc_dropout()
194
+
195
+ mean_all = np.concatenate(all_means)
196
+ std_all = np.concatenate(all_stds)
197
+ targets_all = np.concatenate(all_targets)
198
+ weights_all = np.concatenate(all_weights)
199
+
200
+ if cache_dir is not None:
201
+ np.save(fp_mean, mean_all)
202
+ np.save(fp_std, std_all)
203
+ np.save(fp_targets, targets_all)
204
+ np.save(fp_weights, weights_all)
205
+ log.info("MC results cached: %s", cache_dir)
206
+
207
+ return mean_all, std_all, targets_all, weights_all
208
+
209
+
210
+ # ─────────────────────────────────────────────────────────────
211
+ # Figure 1: Uncertainty distributions
212
+ # ─────────────────────────────────────────────────────────────
213
+
214
+ def fig_uncertainty_distributions(mean_preds, std_preds,
215
+ targets, weights, save_dir):
216
+ path_pdf = save_dir / "fig_uncertainty_distributions.pdf"
217
+ path_png = save_dir / "fig_uncertainty_distributions.png"
218
+ if path_pdf.exists() and path_png.exists():
219
+ log.info("Skip (exists): fig_uncertainty_distributions"); return
220
+
221
+ fig, axes = plt.subplots(3, 4, figsize=(16, 12))
222
+ axes = axes.flatten()
223
+
224
+ for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
225
+ ax = axes[q_idx]
226
+ mask = weights[:, q_idx] >= 0.05
227
+ std_q = std_preds[mask, start:end].mean(axis=1)
228
+
229
+ ax.hist(std_q, bins=50, color="#6366f1", alpha=0.85,
230
+ edgecolor="none", density=True)
231
+ ax.axvline(std_q.mean(), color="#c0392b", linewidth=1.8,
232
+ linestyle="--", label=f"Mean = {std_q.mean():.4f}")
233
+ ax.set_xlabel("MC Dropout std (epistemic uncertainty)")
234
+ ax.set_ylabel("Density")
235
+ ax.set_title(
236
+ f"{q_name}: {QUESTION_LABELS[q_name]}\n"
237
+ f"$n$ = {mask.sum():,} (w β‰₯ 0.05)",
238
+ fontsize=9,
239
+ )
240
+ ax.legend(fontsize=7)
241
+
242
+ axes[-1].axis("off")
243
+ plt.suptitle(
244
+ "Epistemic uncertainty distributions β€” MC Dropout (30 passes)\n"
245
+ "Proposed model (ViT-Base/16 + hierarchical KL+MSE), test set",
246
+ fontsize=12,
247
+ )
248
+ plt.tight_layout()
249
+ fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
250
+ fig.savefig(path_png, dpi=300, bbox_inches="tight")
251
+ plt.close(fig)
252
+ log.info("Saved: fig_uncertainty_distributions")
253
+
254
+
255
+ # ─────────────────────────────────────────────────────────────
256
+ # Figure 2: Uncertainty vs. error (Spearman ρ)
257
+ # ─────────────────────────────────────────────────────────────
258
+
259
+ def fig_uncertainty_vs_error(mean_preds, std_preds,
260
+ targets, weights, save_dir):
261
+ path_pdf = save_dir / "fig_uncertainty_vs_error.pdf"
262
+ path_png = save_dir / "fig_uncertainty_vs_error.png"
263
+ if path_pdf.exists() and path_png.exists():
264
+ log.info("Skip (exists): fig_uncertainty_vs_error"); return
265
+
266
+ fig, axes = plt.subplots(3, 4, figsize=(16, 12))
267
+ axes = axes.flatten()
268
+
269
+ for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
270
+ ax = axes[q_idx]
271
+ mask = weights[:, q_idx] >= 0.05
272
+ unc = std_preds[mask, start:end].mean(axis=1)
273
+ err = np.abs(mean_preds[mask, start:end] -
274
+ targets[mask, start:end]).mean(axis=1)
275
+
276
+ # Adaptive bin means for trend line
277
+ n_bins = 15
278
+ unc_bins = np.unique(np.percentile(unc, np.linspace(0, 100, n_bins + 1)))
279
+ bin_ids = np.clip(np.digitize(unc, unc_bins) - 1, 0, len(unc_bins) - 2)
280
+ bn_unc = [unc[bin_ids == b].mean() for b in range(len(unc_bins) - 1)
281
+ if (bin_ids == b).any()]
282
+ bn_err = [err[bin_ids == b].mean() for b in range(len(unc_bins) - 1)
283
+ if (bin_ids == b).any()]
284
+
285
+ ax.scatter(unc, err, alpha=0.04, s=1, color="#94a3b8", rasterized=True)
286
+ ax.plot(bn_unc, bn_err, "r-o", markersize=4, linewidth=2,
287
+ label="Bin mean")
288
+
289
+ # Spearman rank correlation (more robust than Pearson for this data)
290
+ rho, pval = scipy_stats.spearmanr(unc, err)
291
+ p_str = f"p < 0.001" if pval < 0.001 else f"p = {pval:.3f}"
292
+ ax.text(0.05, 0.90,
293
+ f"Spearman ρ = {rho:.3f}\n{p_str}",
294
+ transform=ax.transAxes, fontsize=7.5,
295
+ bbox=dict(boxstyle="round,pad=0.25", facecolor="white",
296
+ edgecolor="grey", alpha=0.85))
297
+
298
+ ax.set_xlabel("Uncertainty (MC std)")
299
+ ax.set_ylabel("Absolute error")
300
+ ax.set_title(f"{q_name}: {QUESTION_LABELS[q_name]}", fontsize=9)
301
+ ax.legend(fontsize=7)
302
+
303
+ axes[-1].axis("off")
304
+ plt.suptitle(
305
+ "Epistemic uncertainty vs. absolute prediction error β€” per morphological question\n"
306
+ "Strong Spearman ρ for root/shallow questions; weak ρ for deep conditional branches "
307
+ "(expected: aleatoric uncertainty dominates when branch is rarely reached)",
308
+ fontsize=10,
309
+ )
310
+ plt.tight_layout()
311
+ fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
312
+ fig.savefig(path_png, dpi=300, bbox_inches="tight")
313
+ plt.close(fig)
314
+ log.info("Saved: fig_uncertainty_vs_error")
315
+
316
+
317
+ # ─────────────────────────────────────────────────────────────
318
+ # Table: uncertainty summary
319
+ # ─────────────────────────────────────────────────────────────
320
+
321
+ def table_uncertainty_summary(mean_preds, std_preds,
322
+ targets, weights, save_dir):
323
+ path = save_dir / "table_uncertainty_summary.csv"
324
+ if path.exists():
325
+ log.info("Skip (exists): table_uncertainty_summary"); return
326
+
327
+ rows = []
328
+ for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
329
+ mask = weights[:, q_idx] >= 0.05
330
+ unc = std_preds[mask, start:end].mean(axis=1)
331
+ err = np.abs(mean_preds[mask, start:end] -
332
+ targets[mask, start:end]).mean(axis=1)
333
+
334
+ if mask.sum() > 10:
335
+ rho, pval = scipy_stats.spearmanr(unc, err)
336
+ else:
337
+ rho, pval = float("nan"), float("nan")
338
+
339
+ rows.append({
340
+ "question" : q_name,
341
+ "description" : QUESTION_LABELS[q_name],
342
+ "n_reached" : int(mask.sum()),
343
+ "mean_uncertainty": round(float(unc.mean()), 5),
344
+ "std_uncertainty" : round(float(unc.std()), 5),
345
+ "mean_mae" : round(float(err.mean()), 5),
346
+ "spearman_rho" : round(float(rho), 4),
347
+ "spearman_pval" : round(float(pval), 4),
348
+ })
349
+
350
+ df = pd.DataFrame(rows)
351
+ df.to_csv(path, index=False)
352
+ log.info("Saved: table_uncertainty_summary.csv")
353
+ print("\n" + df.to_string(index=False) + "\n")
354
+ return df
355
+
356
+
357
+ # ─────────────────────────────────────────────────────────────
358
+ # Figure 3 + Table: Morphology selection benchmark
359
+ # ─────────────────────────────────────────────────────────────
360
+
361
+ def morphology_selection_benchmark(model_results, save_dir):
362
+ csv_path = save_dir / "table_morphology_selection_benchmark.csv"
363
+ if csv_path.exists():
364
+ log.info("Loading existing morphology benchmark...")
365
+ df = pd.read_csv(csv_path)
366
+ _fig_morphology_f1(df, save_dir)
367
+ return df
368
+
369
+ rows = []
370
+ for model_name, (preds, targets, weights) in model_results.items():
371
+ for q_name, (ans_idx, ans_label) in SELECTION_ANSWERS.items():
372
+ start, end = QUESTION_GROUPS[q_name]
373
+ q_idx = list(QUESTION_GROUPS.keys()).index(q_name)
374
+ mask = weights[:, q_idx] >= 0.05
375
+ pred_a = preds[mask, start + ans_idx]
376
+ true_a = targets[mask, start + ans_idx]
377
+
378
+ for thresh in SELECTION_THRESHOLDS:
379
+ sel = pred_a >= thresh
380
+ true_pos = true_a >= thresh
381
+ n_sel = sel.sum()
382
+ n_tp_all = true_pos.sum()
383
+ n_tp = (sel & true_pos).sum()
384
+ prec = n_tp / n_sel if n_sel > 0 else 0.0
385
+ rec = n_tp / n_tp_all if n_tp_all > 0 else 0.0
386
+ f1 = (2 * prec * rec / (prec + rec)
387
+ if (prec + rec) > 0 else 0.0)
388
+ rows.append({
389
+ "model" : model_name,
390
+ "question" : q_name,
391
+ "answer" : ans_label,
392
+ "threshold" : thresh,
393
+ "n_selected": int(n_sel),
394
+ "n_true_pos": int(n_tp_all),
395
+ "precision" : round(float(prec), 4),
396
+ "recall" : round(float(rec), 4),
397
+ "f1" : round(float(f1), 4),
398
+ })
399
+
400
+ df = pd.DataFrame(rows)
401
+ df.to_csv(csv_path, index=False)
402
+ log.info("Saved: table_morphology_selection_benchmark.csv")
403
+ _fig_morphology_f1(df, save_dir)
404
+ return df
405
+
406
+
407
+ def _fig_morphology_f1(df, save_dir):
408
+ path_pdf = save_dir / "fig_morphology_f1_comparison.pdf"
409
+ path_png = save_dir / "fig_morphology_f1_comparison.png"
410
+ if path_pdf.exists() and path_png.exists():
411
+ log.info("Skip (exists): fig_morphology_f1_comparison"); return
412
+
413
+ thresh = 0.8
414
+ sub = df[df["threshold"] == thresh]
415
+ q_list = list(SELECTION_ANSWERS.keys())
416
+ models = list(df["model"].unique())
417
+
418
+ x = np.arange(len(q_list))
419
+ width = 0.80 / len(models)
420
+ palette = list(MODEL_COLORS.values())
421
+
422
+ fig, ax = plt.subplots(figsize=(12, 5))
423
+ for i, model in enumerate(models):
424
+ f1s = []
425
+ for q in q_list:
426
+ row = sub[(sub["model"] == model) & (sub["question"] == q)]
427
+ f1s.append(float(row["f1"].values[0]) if len(row) > 0 else 0.0)
428
+ ax.bar(
429
+ x + i * width, f1s, width,
430
+ label=model,
431
+ color=MODEL_COLORS.get(model, palette[i % len(palette)]),
432
+ alpha=0.85, edgecolor="white", linewidth=0.5,
433
+ )
434
+
435
+ ax.set_xticks(x + width * (len(models) - 1) / 2)
436
+ ax.set_xticklabels(
437
+ [f"{q}\n({SELECTION_ANSWERS[q][1]})" for q in q_list], fontsize=9
438
+ )
439
+ ax.set_ylabel("F$_1$ score", fontsize=11)
440
+ ax.set_title(
441
+ f"Downstream morphology selection β€” F$_1$ at threshold $\\tau$ = {thresh}\n"
442
+ "Higher F$_1$ indicates cleaner, more complete morphological sample selection.",
443
+ fontsize=11,
444
+ )
445
+ ax.legend(fontsize=8)
446
+ ax.set_ylim(0, 1)
447
+ ax.grid(True, alpha=0.3, axis="y")
448
+ ax.set_axisbelow(True)
449
+ plt.tight_layout()
450
+ fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
451
+ fig.savefig(path_png, dpi=300, bbox_inches="tight")
452
+ plt.close(fig)
453
+ log.info("Saved: fig_morphology_f1_comparison")
454
+
455
+
456
+ # ─────────────────────────────────────────────────────────────
457
+ # Main
458
+ # ─────────────────────────────────────────────────────────────
459
+
460
+ def main():
461
+ parser = argparse.ArgumentParser()
462
+ parser.add_argument("--config", required=True)
463
+ parser.add_argument("--n_passes", type=int, default=30)
464
+ args = parser.parse_args()
465
+
466
+ base_cfg = OmegaConf.load("configs/base.yaml")
467
+ exp_cfg = OmegaConf.load(args.config)
468
+ cfg = OmegaConf.merge(base_cfg, exp_cfg)
469
+
470
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
471
+ save_dir = Path(cfg.outputs.figures_dir) / "uncertainty"
472
+ save_dir.mkdir(parents=True, exist_ok=True)
473
+ cache_dir = save_dir / "mc_cache"
474
+ ckpt_dir = Path(cfg.outputs.checkpoint_dir)
475
+
476
+ _, _, test_loader = build_dataloaders(cfg)
477
+
478
+ # ── 1. MC Dropout on proposed model ───────────────────────
479
+ log.info("Loading proposed model...")
480
+ proposed = build_model(cfg).to(device)
481
+ proposed.load_state_dict(
482
+ torch.load(ckpt_dir / "best_full_train.pt",
483
+ map_location="cpu", weights_only=True)["model_state"]
484
+ )
485
+
486
+ mean_preds, std_preds, targets, weights = run_mc_inference(
487
+ proposed, test_loader, device, cfg,
488
+ n_passes=args.n_passes, cache_dir=cache_dir,
489
+ )
490
+ log.info("MC Dropout complete: %d galaxies, %d passes.",
491
+ len(mean_preds), args.n_passes)
492
+
493
+ # ── 2. Uncertainty figures and table ──────────────────────
494
+ fig_uncertainty_distributions(mean_preds, std_preds, targets, weights, save_dir)
495
+ fig_uncertainty_vs_error(mean_preds, std_preds, targets, weights, save_dir)
496
+ table_uncertainty_summary(mean_preds, std_preds, targets, weights, save_dir)
497
+
498
+ # ── 3. Downstream benchmark across all models ─────────────
499
+ log.info("Building model_results for downstream benchmark...")
500
+
501
+ model_results = {
502
+ "ViT-Base + KL+MSE (proposed)": (mean_preds, targets, weights),
503
+ }
504
+
505
+ def _load_resnet(ckpt_name, use_sigmoid):
506
+ m = ResNet18Baseline(dropout=cfg.model.dropout).to(device)
507
+ m.load_state_dict(
508
+ torch.load(ckpt_dir / ckpt_name, map_location="cpu",
509
+ weights_only=True)["model_state"]
510
+ )
511
+ m.eval()
512
+ preds_l, tgts_l, wgts_l = [], [], []
513
+ with torch.no_grad():
514
+ for images, tgts, wgts, _ in tqdm(test_loader, desc=f"ResNet {ckpt_name}"):
515
+ images = images.to(device, non_blocking=True)
516
+ with autocast("cuda", enabled=cfg.training.mixed_precision):
517
+ logits = m(images)
518
+ if use_sigmoid:
519
+ p = torch.sigmoid(logits).cpu().numpy()
520
+ else:
521
+ p = logits.detach().cpu().clone()
522
+ for q, (s, e) in QUESTION_GROUPS.items():
523
+ p[:, s:e] = F.softmax(p[:, s:e], dim=-1)
524
+ p = p.numpy()
525
+ preds_l.append(p)
526
+ tgts_l.append(tgts.numpy())
527
+ wgts_l.append(wgts.numpy())
528
+ return (np.concatenate(preds_l),
529
+ np.concatenate(tgts_l),
530
+ np.concatenate(wgts_l))
531
+
532
+ rn_mse_ckpt = "baseline_resnet18_mse.pt"
533
+ rn_klm_ckpt = "baseline_resnet18_klmse.pt"
534
+
535
+ if (ckpt_dir / rn_mse_ckpt).exists():
536
+ model_results["ResNet-18 + MSE (sigmoid)"] = _load_resnet(
537
+ rn_mse_ckpt, use_sigmoid=True
538
+ )
539
+ if (ckpt_dir / rn_klm_ckpt).exists():
540
+ model_results["ResNet-18 + KL+MSE"] = _load_resnet(
541
+ rn_klm_ckpt, use_sigmoid=False
542
+ )
543
+
544
+ dp = ckpt_dir / "baseline_vit_dirichlet.pt"
545
+ if dp.exists():
546
+ vit_dir = build_dirichlet_model(cfg).to(device)
547
+ vit_dir.load_state_dict(
548
+ torch.load(dp, map_location="cpu", weights_only=True)["model_state"]
549
+ )
550
+ vit_dir.eval()
551
+ d_p, d_t, d_w = [], [], []
552
+ with torch.no_grad():
553
+ for images, tgts, wgts, _ in tqdm(test_loader, desc="Dirichlet"):
554
+ images = images.to(device, non_blocking=True)
555
+ with autocast("cuda", enabled=cfg.training.mixed_precision):
556
+ alpha = vit_dir(images)
557
+ p, t, w = dirichlet_predictions_to_numpy(alpha, tgts, wgts)
558
+ d_p.append(p); d_t.append(t); d_w.append(w)
559
+ model_results["ViT-Base + Dirichlet (Zoobot-style)"] = (
560
+ np.concatenate(d_p),
561
+ np.concatenate(d_t),
562
+ np.concatenate(d_w),
563
+ )
564
+
565
+ df_sel = morphology_selection_benchmark(model_results, save_dir)
566
+
567
+ log.info("=" * 60)
568
+ log.info("DOWNSTREAM F1 @ Ο„ = 0.8")
569
+ log.info("=" * 60)
570
+ summary = df_sel[df_sel["threshold"] == 0.8][
571
+ ["model", "question", "answer", "precision", "recall", "f1"]
572
+ ]
573
+ log.info("\n%s\n", summary.to_string(index=False))
574
+ log.info("All outputs saved to: %s", save_dir)
575
+
576
+
577
+ if __name__ == "__main__":
578
+ main()