Upload 2 files
Example for using the model, with the required classes. (A minimal quick-start sketch follows the file list.)
- examples/demo.py +86 -0
- examples/stradavit_model.py +229 -0
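
A minimal quick-start sketch of how the two files fit together (an illustrative example, not part of the upload: it assumes both files are on the import path, that torch, transformers, and Pillow are installed, and that the checkpoint path is a placeholder):

# Hypothetical quick-start (paths are placeholders).
import torch
from PIL import Image
from transformers import ViTImageProcessor
from stradavit_model import StradaViTModel  # shipped as examples/stradavit_model.py

ckpt = "/path/to/run/checkpoints"
model = StradaViTModel.from_pretrained(ckpt).eval()
processor = ViTImageProcessor.from_pretrained(ckpt)

img = Image.open("image.png").convert("RGB")
pixel_values = processor(images=img, return_tensors="pt")["pixel_values"]
with torch.inference_mode():
    emb = model(pixel_values=pixel_values).embedding  # shape: (1, hidden_size)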
examples/demo.py
ADDED
@@ -0,0 +1,86 @@
"""
Example: load a shipped StradaViT checkpoint and extract embeddings.

This mirrors the embedding policy:
- Use the ViT encoder's `last_hidden_state`
- Mean-pool patch tokens (drop CLS): `hs[:, 1:, :].mean(dim=1)`

Expected checkpoint layout (from our training scripts):
    <RUN_ROOT>/checkpoints/
        - config.json
        - pytorch_model.bin (or model.safetensors)
        - preprocessor_config.json
        - (optional) tokenizer/feature extractor extras

Usage:
    python3 examples/demo.py \\
        --checkpoint /path/to/run/checkpoints \\
        --image /path/to/image.png
"""

from __future__ import annotations

import argparse
import os
from typing import Any

import torch

# The model class ships alongside this demo in examples/stradavit_model.py.
from stradavit_model import StradaViTModel


def load_model_and_processor(checkpoint_dir: str):
    """
    Loads a StradaViT checkpoint and the matching HF image processor.
    """
    from transformers import ViTImageProcessor, ViTMAEConfig

    config = ViTMAEConfig.from_pretrained(checkpoint_dir)
    processor = ViTImageProcessor.from_pretrained(checkpoint_dir)
    model = StradaViTModel.from_pretrained(checkpoint_dir)
    model.eval()
    return model, processor, config


def load_image(path: str):
    from PIL import Image

    img = Image.open(path).convert("RGB")
    return img


def main(argv: list[str] | None = None) -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument("--checkpoint", required=True, help="Path to <run_root>/checkpoints (contains config + weights)")
    ap.add_argument("--image", required=True, help="Path to an image file (png/jpg/...)")
    ap.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    args = ap.parse_args(argv)

    ckpt = os.path.abspath(args.checkpoint)
    if not os.path.isdir(ckpt):
        raise FileNotFoundError(f"--checkpoint must be a directory: {ckpt}")

    device = torch.device(args.device)

    model, processor, config = load_model_and_processor(ckpt)
    model.to(device)

    img = load_image(args.image)
    inputs: dict[str, Any] = processor(images=img, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    with torch.inference_mode():
        out = model(pixel_values=pixel_values)
        emb = out.embedding

    print(
        f"Loaded checkpoint: {ckpt}\n"
        f" model_type={getattr(config, 'model_type', None)} use_dino_encoder={bool(getattr(config, 'use_dino_encoder', False))} "
        f"n_registers={int(getattr(config, 'n_registers', 0) or 0)}\n"
        f" image_size={int(getattr(config, 'image_size', 0) or 0)} patch_size={int(getattr(config, 'patch_size', 0) or 0)}\n"
        f"Embedding shape: {tuple(emb.shape)} dtype={emb.dtype} device={emb.device}"
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
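
A natural follow-on to the demo is similarity scoring between images. A minimal sketch, assuming `emb_a` and `emb_b` are `(1, hidden_size)` embeddings produced as `out.embedding` above (both names are illustrative):

# Hypothetical follow-on: compare two demo embeddings.
import torch.nn.functional as F

# emb_a, emb_b: (1, hidden_size) embeddings from two images.
score = F.cosine_similarity(emb_a, emb_b).item()
print(f"cosine similarity: {score:.4f}")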
examples/stradavit_model.py
ADDED
@@ -0,0 +1,229 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Any

import torch
import torch.nn as nn


@dataclass
class StradaViTOutput:
    embedding: torch.Tensor
    last_hidden_state: torch.Tensor | None = None
    hidden_states: Any | None = None
    attentions: Any | None = None


def _pool_patch_mean(last_hidden_state: torch.Tensor) -> torch.Tensor:
    # Mirror `pretraining/ft_test_llrd.py`: mean over all non-CLS tokens.
    if last_hidden_state.dim() != 3 or last_hidden_state.size(1) < 2:
        raise ValueError(f"Expected (B, T, D) with CLS+patches, got {tuple(last_hidden_state.shape)}")
    return last_hidden_state[:, 1:, :].mean(dim=1)


class StradaViTModel(nn.Module):
    """
    Lightweight encoder-only wrapper that exposes a consistent embedding API for:
    - vanilla ViTMAE checkpoints (any patch size)
    - register-aware / Dinov2Encoder-backed MAE checkpoints

    Embedding policy matches `pretraining/ft_test_llrd.py`:
        embedding = mean over patch tokens (drop CLS).
    """

    def __init__(self, backbone: nn.Module):
        super().__init__()
        self.backbone = backbone
        self.config = getattr(backbone, "config", None)

    @classmethod
    def from_pretrained(cls, checkpoint_path: str, **kwargs):
        """
        Loads a backbone in a way that is compatible with our checkpoints:
        - If config indicates registers or Dinov2Encoder path, use `ViTMAEWithRegistersModel`.
        - Else use `ViTModel` to avoid MAE random masking/shuffling in downstream usage.
        """
        from transformers import ViTModel, ViTMAEConfig

        config = ViTMAEConfig.from_pretrained(checkpoint_path)
        use_dino_encoder = bool(getattr(config, "use_dino_encoder", False))
        n_registers = int(getattr(config, "n_registers", 0) or 0)

        if use_dino_encoder or n_registers > 0:
            from pretraining.vit_mae_registers import ViTMAEWithRegistersModel

            backbone = ViTMAEWithRegistersModel.from_pretrained(
                checkpoint_path,
                n_registers=n_registers,
                ignore_mismatched_sizes=True,
                **kwargs,
            )
        else:
            # ViTModel loads MAE weights with an expected "vit_mae -> vit" type conversion warning.
            backbone = ViTModel.from_pretrained(
                checkpoint_path,
                add_pooling_layer=False,
                **kwargs,
            )
        return cls(backbone=backbone)

    def _forward_backbone(self, pixel_values: torch.Tensor, **kwargs) -> Any:
        """
        Runs the backbone and returns its native outputs.
        For MAE-family backbones, we disable embeddings.random_masking to get a full-image encoding.
        """
        bb = self.backbone
        emb = getattr(bb, "embeddings", None)
        if emb is None or not hasattr(emb, "random_masking"):
            return bb(pixel_values=pixel_values, **kwargs)

        orig_random_masking = emb.random_masking

        def _random_masking_noop(self, x: torch.Tensor, noise: torch.Tensor | None = None):
            if not isinstance(x, torch.Tensor):
                x = torch.as_tensor(x)
            if x.dim() != 3:
                B = x.size(0) if x.dim() > 0 else 1
                L = x.size(1) if x.dim() > 1 else 1
                mask = x.new_zeros(B, L)
                ids_restore = torch.arange(L, device=x.device).unsqueeze(0).expand(B, -1)
                return x, mask, ids_restore
            B, L, _ = x.shape
            device = x.device
            mask = x.new_zeros(B, L)
            ids_restore = torch.arange(L, device=device).unsqueeze(0).expand(B, -1)
            return x, mask, ids_restore

        try:
            import types

            emb.random_masking = types.MethodType(_random_masking_noop, emb)
            return bb(pixel_values=pixel_values, **kwargs)
        finally:
            emb.random_masking = orig_random_masking

    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: bool | None = None,
        output_attentions: bool | None = None,
        return_dict: bool | None = True,
        **kwargs,
    ) -> StradaViTOutput:
        outputs = self._forward_backbone(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True,
            **kwargs,
        )
        last_hidden_state = getattr(outputs, "last_hidden_state", None)
        if last_hidden_state is None:
            # Some HF models may return a tuple.
            if isinstance(outputs, (tuple, list)) and len(outputs) > 0:
                last_hidden_state = outputs[0]
            else:
                raise ValueError("Backbone output does not include last_hidden_state")

        emb = _pool_patch_mean(last_hidden_state)
        out = StradaViTOutput(
            embedding=emb,
            last_hidden_state=last_hidden_state,
            hidden_states=getattr(outputs, "hidden_states", None),
            attentions=getattr(outputs, "attentions", None),
        )
        return out


class StradaViTForImageClassification(nn.Module):
    """
    Simple classification head on top of `StradaViTModel` embeddings.

    Head policy:
    - LayerNorm (+ optional dropout) + Linear for all MAE-family variants.

    Rationale: consistent ViT fine-tuning protocol and batch-size agnostic normalization.
    """

    def __init__(
        self,
        checkpoint_path: str,
        num_labels: int,
        class_weights: list[float] | None = None,
        head_norm: str = "ln",  # kept for backward compatibility; must be "ln" or "auto"
        n_registers: int | None = None,  # accepted for call-site compatibility; config remains source of truth
    ):
        super().__init__()
        self.backbone = StradaViTModel.from_pretrained(checkpoint_path)
        self.config = getattr(self.backbone, "config", None)
        self.num_labels = int(num_labels)

        hidden_size = None
        if self.config is not None:
            hidden_size = getattr(self.config, "hidden_size", None)
        if hidden_size is None:
            raise ValueError("Could not infer hidden_size from backbone config.")

        if class_weights is not None:
            self.register_buffer(
                "class_weights",
                torch.tensor(class_weights, dtype=torch.float32),
            )
        else:
            self.class_weights = None

        cfg_n_regs = int(getattr(self.config, "n_registers", 0) or 0) if self.config is not None else 0
        cfg_use_dino = bool(getattr(self.config, "use_dino_encoder", False)) if self.config is not None else False
        if n_registers is not None and int(n_registers) != cfg_n_regs:
            raise ValueError(f"n_registers={int(n_registers)} does not match checkpoint config.n_registers={cfg_n_regs}.")

        if head_norm not in ("auto", "ln"):
            raise ValueError("head_norm must be one of {'ln','auto'} (BatchNorm is disabled).")
        # "auto" is retained for older call sites; it maps to LN unconditionally now.
        head_norm = "ln"

        dropout_prob = float(getattr(self.config, "classifier_dropout_prob", 0.0) or 0.0) if self.config is not None else 0.0
        ln_eps = float(getattr(self.config, "layer_norm_eps", 1e-6) or 1e-6) if self.config is not None else 1e-6

        self.norm = nn.LayerNorm(int(hidden_size), eps=ln_eps)
        self.dropout = nn.Dropout(dropout_prob)

        self.classifier = nn.Linear(int(hidden_size), self.num_labels)
        nn.init.trunc_normal_(self.classifier.weight, std=0.02)
        if self.classifier.bias is not None:
            nn.init.zeros_(self.classifier.bias)

    def forward(self, pixel_values=None, labels=None, **kwargs):
        out = self.backbone(pixel_values=pixel_values, **kwargs)
        x = out.embedding
        x = self.norm(x)
        x = self.dropout(x)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            if getattr(self, "class_weights", None) is not None:
                loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            else:
                loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # Prefer HF's standard output container when available (Trainer-friendly),
        # but keep a dict fallback so this module can be imported without transformers installed.
        try:
            from transformers.modeling_outputs import ImageClassifierOutput  # type: ignore

            return ImageClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=out.hidden_states,
                attentions=out.attentions,
            )
        except Exception:
            return {
                "loss": loss,
                "logits": logits,
                "hidden_states": out.hidden_states,
                "attentions": out.attentions,
            }
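
Finally, a hedged sketch of the classification wrapper in use (checkpoint path, num_labels, class weights, input resolution, and labels are all illustrative):

# Hypothetical usage of StradaViTForImageClassification.
import torch
from stradavit_model import StradaViTForImageClassification

clf = StradaViTForImageClassification(
    checkpoint_path="/path/to/run/checkpoints",  # placeholder
    num_labels=3,
    class_weights=[1.0, 2.0, 1.0],  # optional; omit for unweighted cross-entropy
).eval()

pixel_values = torch.randn(2, 3, 224, 224)  # dummy batch; assumes a 224px checkpoint
labels = torch.tensor([0, 2])
out = clf(pixel_values=pixel_values, labels=labels)
print(tuple(out.logits.shape), out.loss)  # (2, 3) and a scalar cross-entropy loss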