"""
TILA (Temporal Inversion-aware Learning and Alignment): Model Architecture

Paper: "Temporal Inversion for Learning Interval Change in Chest X-Rays" (CVPR 2026)
       http://arxiv.org/abs/2604.04563

This module contains the full model architecture for TILA, built on top of the
BioViL-T (ResNet-50 + Vision Transformer pooler) backbone and CXR-BERT text encoder.

Dependencies:
    pip install torch torchvision timm transformers safetensors
"""

from __future__ import annotations

import math
from dataclasses import dataclass
from functools import partial
from typing import Optional, Sequence, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.layers import DropPath, Mlp, trunc_normal_
from torchvision.models.resnet import Bottleneck, conv1x1


# ──────────────────────────────────────────────────────────────────────────────
# Output types
# ──────────────────────────────────────────────────────────────────────────────


@dataclass
class ImageModelOutput:
    img_embedding: torch.Tensor
    patch_embeddings: torch.Tensor
    projected_global_embedding: torch.Tensor
    class_logits: Optional[torch.Tensor]
    projected_patch_embeddings: torch.Tensor


# ──────────────────────────────────────────────────────────────────────────────
# ResNet-50 backbone
# ──────────────────────────────────────────────────────────────────────────────


class ResNet(nn.Module):
    """Standard ResNet-50 (torchvision-compatible) without the final FC layer in forward."""

    def __init__(
        self,
        layers: Sequence[int] = (3, 4, 6, 3),
        num_classes: int = 1000,
        zero_init_residual: bool = False,
        replace_stride_with_dilation: Optional[Sequence[bool]] = None,
    ):
        super().__init__()
        block = Bottleneck
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            replace_stride_with_dilation = [False, False, False]

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2])

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Weight init
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck) and m.bn3.weight is not None:
                    nn.init.constant_(m.bn3.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = [block(self.inplanes, planes, stride, downsample, 1, 64, previous_dilation, nn.BatchNorm2d)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=self.dilation, norm_layer=nn.BatchNorm2d))
        return nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x  # patch features [B, 2048, H, W]


# ──────────────────────────────────────────────────────────────────────────────
# Vision Transformer Pooler (temporal attention)
# ──────────────────────────────────────────────────────────────────────────────


class SinePositionEmbedding:
    def __init__(self, embedding_dim: int = 64, temperature: int = 10000,
                 normalize: bool = False, scale: Optional[float] = None):
        self.embedding_dim = embedding_dim
        self.temperature = temperature
        self.normalize = normalize
        self.scale = scale if scale is not None else 2 * math.pi

    def __call__(self, mask: torch.Tensor) -> torch.Tensor:
        B, H, W = mask.shape
        y_embed = mask.cumsum(1, dtype=torch.float32)
        x_embed = mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
        dim_t = torch.arange(self.embedding_dim, dtype=torch.float32)
        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        return torch.cat((pos_y, pos_x), dim=3).view(B, H * W, self.embedding_dim * 2)
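
# Shape sketch: with the configuration used by VisionTransformerPooler below
# (embedding_dim = input_dim // 2 = 128, normalize=True, input_dim=256),
# SinePositionEmbedding(128, normalize=True)(torch.ones(1, 14, 14)) yields a
# [1, 196, 256] tensor: one positional vector per patch, formed by concatenating
# the sine/cosine y- and x-halves.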


class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False,
                 attn_drop: float = 0.0, proj_drop: float = 0.0):
        super().__init__()
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5
        self.proj_q = nn.Linear(dim, dim, bias=qkv_bias)
        self.proj_k = nn.Linear(dim, dim, bias=qkv_bias)
        self.proj_v = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, k, q, v):
        B, N, C = v.shape
        h = self.num_heads
        wq = self.proj_q(q).reshape(B, N, h, C // h).permute(0, 2, 1, 3)
        wk = self.proj_k(k).reshape(B, N, h, C // h).permute(0, 2, 1, 3)
        wv = self.proj_v(v).reshape(B, N, h, C // h).permute(0, 2, 1, 3)
        attn = (wq @ wk.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        o = (attn @ wv).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(o))


class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=1.0, qkv_bias=False,
                 drop=0.0, attn_drop=0.0, drop_path=0.0,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = MultiHeadAttentionLayer(dim, num_heads, qkv_bias, attn_drop, drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

    def forward(self, x, pos_and_type_embed=None):
        x_norm = self.norm1(x)
        if pos_and_type_embed is not None:
            x_norm = x_norm + pos_and_type_embed
        x = x + self.drop_path(self.attn(x_norm, x_norm, x_norm))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
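
# Note: unlike a plain ViT, the positional + type embeddings are re-added inside
# every Block (to the LayerNormed attention input via pos_and_type_embed), not
# just once at the input.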


class VisionTransformerPooler(nn.Module):
    def __init__(self, input_dim: int, grid_shape: Tuple[int, int],
                 num_heads: int = 8, num_blocks: int = 3,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6)):
        super().__init__()
        block_kwargs = dict(dim=input_dim, num_heads=num_heads, mlp_ratio=1.0,
                            drop=0.10, attn_drop=0.10, drop_path=0.25,
                            act_layer=nn.GELU, norm_layer=norm_layer)
        self.blocks = nn.ModuleList([Block(**block_kwargs) for _ in range(num_blocks)])
        self.norm_post = norm_layer(input_dim)
        self.grid_shape = grid_shape
        self.num_patches = grid_shape[0] * grid_shape[1]

        self.type_embed = nn.Parameter(torch.zeros(2, 1, input_dim))
        trunc_normal_(self.type_embed, std=0.02)

        self.pos_drop = nn.Dropout(p=0.10)
        pos_embed = SinePositionEmbedding(input_dim // 2, normalize=True)(
            torch.ones([1, grid_shape[0], grid_shape[1]]))
        self.register_buffer("pos_embed", pos_embed, persistent=False)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, current_image, previous_image=None):
        B, C, H, W = current_image.shape
        if previous_image is not None:
            prev = previous_image.view(B, C, H * W).transpose(1, 2)
        else:
            prev = None
        cur = current_image.view(B, C, H * W).transpose(1, 2)
        pos = self.pos_embed.repeat(B, 1, 1)

        L = cur.shape[1]
        type_emb = self.type_embed[0].expand(B, L, -1)
        if prev is not None:
            x = torch.cat((cur, prev), dim=1)
            pos = torch.cat((pos, pos), dim=1)
            type_emb = torch.cat((type_emb, self.type_embed[1].expand(B, L, -1)), dim=1)
        else:
            x = cur

        pos_type = pos + type_emb
        x = self.pos_drop(x)
        for blk in self.blocks:
            x = blk(x, pos_type)
        x = self.norm_post(x)

        # Keep only the current-image tokens and reshape back to a feature map.
        return x[:, :self.num_patches].transpose(1, 2).view(B, C, H, W)


# ──────────────────────────────────────────────────────────────────────────────
# Multi-image encoder (temporal)
# ──────────────────────────────────────────────────────────────────────────────


class MLP(nn.Module):
    """Projection MLP (1x1 conv based)."""
    def __init__(self, input_dim, output_dim, hidden_dim=None, use_1x1_convs=False):
        super().__init__()
        if use_1x1_convs and hidden_dim is not None:
            self.model = nn.Sequential(
                nn.Conv2d(input_dim, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU(inplace=True),
                nn.Conv2d(hidden_dim, output_dim, 1, bias=True),
            )
        elif hidden_dim is not None:
            self.model = nn.Sequential(
                nn.Linear(input_dim, hidden_dim, bias=False),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(inplace=True),
                nn.Linear(hidden_dim, output_dim, bias=True),
            )
        else:
            self.model = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.model(x)


class MultiImageEncoder(nn.Module):
    """BioViL-T style multi-image encoder: ResNet-50 backbone + ViT temporal pooler."""

    def __init__(self):
        super().__init__()
        self.encoder = ResNet()
        backbone_out_dim = 2048  # ResNet-50 output channels
        output_dim = 256

        self.backbone_to_vit = nn.Conv2d(backbone_out_dim, output_dim, 1, bias=False)
        self.vit_pooler = VisionTransformerPooler(input_dim=output_dim, grid_shape=(14, 14))
        self.missing_previous_emb = nn.Parameter(torch.zeros(1, output_dim, 1, 1))
        trunc_normal_(self.missing_previous_emb, std=0.02)

    def forward(self, current_image, previous_image=None, return_patch_embeddings=False):
        B = current_image.shape[0]
        if previous_image is not None:
            x = torch.cat([current_image, previous_image], dim=0)
            x = self.encoder(x)
            x = self.backbone_to_vit(x)
            patch_x, patch_prev = x[:B], x[B:]
            diff_x = self.vit_pooler(current_image=patch_x, previous_image=patch_prev)
        else:
            x = self.encoder(current_image)
            patch_x = self.backbone_to_vit(x)
            _, _, H, W = patch_x.shape
            diff_x = self.missing_previous_emb.repeat(B, 1, H, W)

        patch_fused = torch.cat([patch_x, diff_x], dim=1)  # [B, 512, H, W]
        avg_pooled = torch.flatten(F.adaptive_avg_pool2d(patch_fused, (1, 1)), 1)

        if return_patch_embeddings:
            return patch_fused, avg_pooled
        return avg_pooled
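
# Shape walk-through (assuming 448x448 inputs, matching the hardcoded 14x14 grid):
#   self.encoder(x)          -> [B, 2048, 14, 14]
#   self.backbone_to_vit(x)  -> [B, 256, 14, 14]   (patch_x, and patch_prev if given)
#   self.vit_pooler(...)     -> [B, 256, 14, 14]   (temporal "difference" features)
#   patch_fused              -> [B, 512, 14, 14]
#   avg_pooled               -> [B, 512]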


class TILAImageEncoder(nn.Module):
    """Full TILA image encoder: MultiImageEncoder + projection head.

    Outputs 128-dim normalized embeddings suitable for CLIP-style retrieval.
    """

    JOINT_FEATURE_SIZE = 128

    def __init__(self):
        super().__init__()
        self.encoder = MultiImageEncoder()
        self.projector = MLP(
            input_dim=512,  # patch_x (256) + diff_x (256)
            output_dim=self.JOINT_FEATURE_SIZE,
            hidden_dim=self.JOINT_FEATURE_SIZE,
            use_1x1_convs=True,
        )

    def forward(self, current_image, previous_image=None):
        patch_fused, pooled = self.encoder(current_image, previous_image, return_patch_embeddings=True)
        projected_patch = self.projector(patch_fused)
        projected_global = torch.mean(projected_patch, dim=(2, 3))
        return ImageModelOutput(
            img_embedding=pooled,
            patch_embeddings=patch_fused,
            class_logits=None,
            projected_patch_embeddings=projected_patch,
            projected_global_embedding=projected_global,
        )
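
# The projector maps the fused [B, 512, 14, 14] map to [B, 128, 14, 14] patch
# embeddings; projected_global_embedding is their spatial mean, shape [B, 128].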


# ──────────────────────────────────────────────────────────────────────────────
# Text encoder (BioViL-T CXR-BERT + projection)
# ──────────────────────────────────────────────────────────────────────────────


TEXT_MODEL_NAME = "microsoft/BiomedVLP-BioViL-T"


class TextEncoder(nn.Module):
    """CXR-BERT text encoder with a projection head to 128-dim.

    Loads the pretrained BioViL-T text model and adds a LayerNorm + Linear
    projection from 768-dim CLS embeddings to 128-dim joint space.
    """

    def __init__(self):
        super().__init__()
        from transformers import AutoConfig, AutoModel

        config = AutoConfig.from_pretrained(TEXT_MODEL_NAME, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(
            TEXT_MODEL_NAME, config=config, trust_remote_code=True,
        )
        self.projection = nn.Sequential(
            nn.LayerNorm(config.hidden_size),
            nn.Linear(config.hidden_size, 128),
        )

    def forward(self, text_inputs: dict) -> torch.Tensor:
        """Encode tokenized text to 128-dim embeddings.

        Args:
            text_inputs: Dict from tokenizer (input_ids, attention_mask, etc.)

        Returns:
            Projected CLS embeddings [B, 128]
        """
        outputs = self.model(**text_inputs)
        cls_emb = outputs.last_hidden_state[:, 0, :]
        proj_dtype = next(self.projection.parameters()).dtype
        if cls_emb.dtype != proj_dtype:
            cls_emb = cls_emb.to(proj_dtype)
        return self.projection(cls_emb)
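
# Usage sketch (a hedged example; TILAModel.encode_text below wraps this same flow):
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME, trust_remote_code=True)
#   batch = tokenizer(["Improved pulmonary edema."], return_tensors="pt", padding=True)
#   emb = TextEncoder()(dict(batch))  # -> [1, 128]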


# ──────────────────────────────────────────────────────────────────────────────
# Interval change classifier head
# ──────────────────────────────────────────────────────────────────────────────


class IntervalChangeClassifier(nn.Module):
    """Binary classifier head for interval change detection.

    Takes 128-dim projected embeddings and outputs a change probability.
    """

    def __init__(self):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, embedding: torch.Tensor) -> torch.Tensor:
        """Returns logit (pre-sigmoid). Apply torch.sigmoid() to get probability."""
        return self.head(embedding).squeeze(-1)


# ──────────────────────────────────────────────────────────────────────────────
# Full model wrapper
# ──────────────────────────────────────────────────────────────────────────────


try:
    from transformers import PreTrainedModel
    from configuration_tila import TILAConfig
    _BASE_CLASS = PreTrainedModel
    _HAS_TRANSFORMERS = True
except ImportError:
    _BASE_CLASS = nn.Module
    _HAS_TRANSFORMERS = False


class TILAModel(_BASE_CLASS):
    """TILA model with image encoder, text encoder, and interval change classifier.

    Usage:
        # Load from local safetensors
        model = TILAModel.from_pretrained("model.safetensors")

        # Load via AutoModel (requires config.json + trust_remote_code)
        from transformers import AutoModel
        model = AutoModel.from_pretrained("lukeingawesome/TILA", trust_remote_code=True)

        # Get 128-dim image embeddings
        emb = model.get_embeddings(current_img, previous_img)

        # Get 128-dim text embeddings
        text_emb = model.encode_text(["Improved pulmonary edema."])

        # Predict interval change
        result = model.get_interval_change_prediction(current_img, previous_img)
    """

    if _HAS_TRANSFORMERS:
        config_class = TILAConfig

    def __init__(self, config=None):
        if _HAS_TRANSFORMERS and config is None:
            config = TILAConfig()
        if _HAS_TRANSFORMERS:
            super().__init__(config)
        else:
            super().__init__()
        self.image_encoder = TILAImageEncoder()
        self.text_encoder = TextEncoder()
        self.change_classifier = IntervalChangeClassifier()

    @torch.no_grad()
    def encode_text(self, texts: list) -> torch.Tensor:
        """Encode text prompts to 128-dim normalized embeddings.

        Args:
            texts: List of text strings

        Returns:
            Normalized text embeddings [N, 128]
        """
        from transformers import AutoTokenizer
        if not hasattr(self, "_tokenizer"):
            self._tokenizer = AutoTokenizer.from_pretrained(
                TEXT_MODEL_NAME, padding_side="right", trust_remote_code=True,
            )
        device = next(self.parameters()).device
        tokens = self._tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256)
        tokens = {k: v.to(device) for k, v in tokens.items()}
        self.eval()
        emb = self.text_encoder(tokens)
        return F.normalize(emb.float(), p=2, dim=1)

    @torch.no_grad()
    def get_embeddings(
        self, current_image: torch.Tensor, previous_image: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Extract 128-dim projected global embeddings from a pair of chest X-rays.

        Args:
            current_image: Current CXR tensor [B, 3, 448, 448]
            previous_image: Previous CXR tensor [B, 3, 448, 448] (optional)

        Returns:
            Normalized 128-dim embeddings [B, 128]
        """
        self.eval()
        out = self.image_encoder(current_image, previous_image)
        return F.normalize(out.projected_global_embedding.float(), p=2, dim=1)

    # Thresholds calibrated on validation set (AUC=0.7558)
    THRESHOLDS = {
        "default": 0.5000,   # Standard sigmoid midpoint
        "bestf1": 0.2886,    # Youden's J β€” best F1=0.7210, sens=0.7798, spec=0.6166
        "spec95": 0.6370,    # Specificity ~0.95 β€” sens=0.1752, spec=0.9502
    }

    @torch.no_grad()
    def get_interval_change_prediction(
        self,
        current_image: torch.Tensor,
        previous_image: torch.Tensor,
        mode: str = "bestf1",
    ) -> dict:
        """Predict interval change between two chest X-rays.

        Args:
            current_image: Current CXR tensor [B, 3, 448, 448]
            previous_image: Previous CXR tensor [B, 3, 448, 448]
            mode: Threshold mode, one of:
                "default"  : threshold=0.50 (standard sigmoid cutoff)
                "bestf1"   : threshold=0.29 (maximizes F1, balanced sens/spec)
                "spec95"   : threshold=0.64 (targets 95% specificity, conservative)

        Returns:
            Dict with keys:
                "probabilities": raw change probabilities [B]
                "predictions":   binary predictions [B] (0=no change, 1=change)
                "threshold":     threshold used (float)
        """
        if mode not in self.THRESHOLDS:
            raise ValueError(f"mode must be one of {list(self.THRESHOLDS.keys())}, got '{mode}'")

        self.eval()
        out = self.image_encoder(current_image, previous_image)
        logits = self.change_classifier(out.projected_global_embedding)
        probs = torch.sigmoid(logits.float())

        threshold = self.THRESHOLDS[mode]
        preds = (probs >= threshold).long()

        return {"probabilities": probs, "predictions": preds, "threshold": threshold}

    @classmethod
    def from_pretrained(cls, path_or_repo: str, device: str = "cpu", **kwargs) -> "TILAModel":
        """Load model from a local file or HuggingFace Hub.

        Args:
            path_or_repo: Local path to model.safetensors, or HF repo ID (e.g. "lukeingawesome/TILA")
            device: Device to load onto

        Examples:
            model = TILAModel.from_pretrained("model.safetensors")
            model = TILAModel.from_pretrained("lukeingawesome/TILA")
        """
        import os

        # When called via HF's AutoModel, a config kwarg is passed through; accept it here
        config = kwargs.pop("config", None)

        # Determine if this is a local file or a HF repo
        if os.path.isfile(path_or_repo):
            safetensors_path = path_or_repo
        elif os.path.isdir(path_or_repo):
            safetensors_path = os.path.join(path_or_repo, "model.safetensors")
        else:
            from huggingface_hub import hf_hub_download
            safetensors_path = hf_hub_download(
                repo_id=path_or_repo,
                filename="model.safetensors",
            )

        model = cls(config=config)

        if safetensors_path.endswith(".safetensors"):
            from safetensors.torch import load_file
            state_dict = load_file(safetensors_path, device=device)
            # Normalize any num_batches_tracked entries saved with shape [1]
            # back to the scalar shape that load_state_dict expects.
            for k, v in state_dict.items():
                if v.dim() == 1 and v.shape[0] == 1 and "num_batches_tracked" in k:
                    state_dict[k] = v.squeeze(0)
        else:
            state_dict = torch.load(safetensors_path, map_location=device, weights_only=True)

        model.load_state_dict(state_dict, strict=False)
        model.eval()
        return model
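

if __name__ == "__main__":
    # Minimal smoke test (a sketch: image weights are randomly initialized here,
    # and the BioViL-T text backbone/tokenizer are downloaded on first use; load
    # real weights with TILAModel.from_pretrained(...) for meaningful predictions).
    model = TILAModel()
    cur = torch.randn(2, 3, 448, 448)
    prev = torch.randn(2, 3, 448, 448)

    img_emb = model.get_embeddings(cur, prev)
    print(img_emb.shape)  # torch.Size([2, 128])

    txt_emb = model.encode_text(["Improved pulmonary edema.", "No interval change."])
    print(txt_emb.shape)  # torch.Size([2, 128])

    result = model.get_interval_change_prediction(cur, prev, mode="bestf1")
    print(result["predictions"], result["threshold"])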