Update SigLino siglino-70M (full content push)
Files changed:
- README.md +23 -9
- config.json +5 -5
- configuration_siglino.py +91 -0
- image_processing_siglino.py +121 -0
- image_processor.py +2 -2
- modeling_siglino.py +339 -0
- preprocessor_config.json +2 -2
- utils.py +13 -13
README.md
CHANGED
@@ -6,17 +6,19 @@ tags:
   - image-feature-extraction
 ---
 
-# 
+# SigLino-70M
 
 **Accepted at CVPR 2026**
 
-[](https://sofianchay.github.io/
+[](https://sofianchay.github.io/siglino/)
 [](https://arxiv.org/abs/2512.20157)
-[](https://github.com/tiiuae/
+[](https://github.com/tiiuae/siglino)
 
+This work stems from the **CVPR 2026 AMoE paper**, which designs and applies distillation into a Mixture-of-Experts (MoE) vision architecture. We have chosen the name **SigLino** for better clarity (SigLIP2 + DINOv3).
 
+Dense variant of SigLino. 70M parameters.
+
+Part of the [SigLino model family](https://huggingface.co/collections/tiiuae/siglino-vision-foundation-models).
 
 ## Usage
 
@@ -25,7 +27,7 @@ import torch
 from PIL import Image
 from transformers import AutoModel, AutoImageProcessor
 
-model_id = "tiiuae/
+model_id = "tiiuae/siglino-70M"
 model = AutoModel.from_pretrained(model_id, trust_remote_code=True).to("cuda", dtype=torch.bfloat16)
 processor = AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
 
@@ -36,8 +38,8 @@ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
 with torch.no_grad():
     outputs = model(**inputs)
 
-# Options: '
-patch_features = outputs["patch_features"]["
+# Options: 'siglino' (512d), 'siglip2' (1152d), 'dinov3' (1024d)
+patch_features = outputs["patch_features"]["siglino"]  # (Batch, Tokens, 512)
 summary_features = outputs["summary_features"]["siglip2"]  # (Batch, 1152)
 ```
 
@@ -53,11 +55,23 @@ summary_features = outputs["summary_features"]["siglip2"]  # (Batch, 1152)
 | Patch Size | 16x16 |
 | Teachers | DINOv3, SigLIP2 |
 
+## Results (512x512, ensemble features)
+
+| Task | Metric | Score |
+|------|--------|-------|
+| kNN (ImageNet) | Acc | 81.7 |
+| kNN (6-dataset avg) | Acc | 86.2 |
+| Zero-shot cls (ImageNet) | Acc | 71.2 |
+| Flickr30K I2T | R@1 | 90.5 |
+| MSCOCO I2T | R@1 | 65.4 |
+| Pascal VOC (1024) | mIoU | 84.8 |
+| Cityscapes (1024) | mIoU | 61.6 |
+
 ## Citation
 
 ```bibtex
 @article{chaybouti2025amoe,
-  title={
+  title={AMoE: Agglomerative Mixture-of-Experts Vision Foundation Models},
   author={Chaybouti, Sofian and Narayan, Sanath and Dahou, Yasser and Le Khac, Phuc H. and Singh, Ankit and Huynh, Ngoc Dung and Para, Wamiq Reyaz and Kuehne, Hilde and Hacid, Hakim},
   journal={arXiv preprint arXiv:2512.20157},
   year={2025}
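The flat `patch_features` above can be folded back into a dense map. A minimal sketch, assuming the processor's `spatial_shapes` entry holds the patch-grid (height, width) per image and that padded patch positions sit at the end of the token axis (both follow from `image_processing_siglino.py` below); `to_feature_map` is a hypothetical helper, not part of the repo:

```python
import einops

def to_feature_map(patch_features, spatial_shapes, index=0):
    # Fold the flat patch tokens of one image back into a (C, H, W) grid.
    h, w = spatial_shapes[index].tolist()     # patch-grid height / width
    tokens = patch_features[index, : h * w]   # drop trailing padded positions
    return einops.rearrange(tokens, "(h w) c -> c h w", h=h, w=w)

feature_map = to_feature_map(patch_features, inputs["spatial_shapes"])  # (512, H, W)
```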
config.json
CHANGED
@@ -1,12 +1,12 @@
 {
   "activation": "silu",
   "architectures": [
-    "
+    "SigLinoModel"
   ],
   "auto_map": {
-    "AutoConfig": "
-    "AutoImageProcessor": "
-    "AutoModel": "
+    "AutoConfig": "configuration_siglino.SigLinoConfig",
+    "AutoImageProcessor": "image_processing_siglino.SigLinoImageProcessor",
+    "AutoModel": "modeling_siglino.SigLinoModel"
   },
   "channel_size": 3,
   "dim": 512,
@@ -16,7 +16,7 @@
   "first_n_layers_dense": 12,
   "head_dim": 64,
   "max_seq_len": 8192,
-  "model_type": "
+  "model_type": "siglino",
   "moe_args": {
     "activation": "silu",
     "num_experts": 1,
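The `auto_map` entries are what let the generic Auto classes pick up the custom code pushed in this commit. A short sketch of the resolution, assuming the repo id `tiiuae/siglino-70M` from the README:

```python
from transformers import AutoConfig, AutoModel

# trust_remote_code imports configuration_siglino.py / modeling_siglino.py
# from the checkpoint repo instead of using a built-in architecture.
config = AutoConfig.from_pretrained("tiiuae/siglino-70M", trust_remote_code=True)
print(type(config).__name__)  # SigLinoConfig
print(config.model_type)      # "siglino"

model = AutoModel.from_pretrained("tiiuae/siglino-70M", trust_remote_code=True)
print(type(model).__name__)   # SigLinoModel
```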
configuration_siglino.py
ADDED
@@ -0,0 +1,91 @@

from transformers import PretrainedConfig
from typing import Optional, List, Union, Dict, Tuple

class SigLinoConfig(PretrainedConfig):
    """
    Configuration class to store the configuration of a `SigLinoModel`.
    """
    model_type = "siglino"

    def __init__(
        self,
        dim: int = 768,
        n_layers: int = 18,
        n_heads: int = 12,
        head_dim: Optional[int] = 128,
        n_kv_heads: Optional[int] = 4,
        # MoE configuration
        moe_dim: int = 768,
        moe_args: Optional[Dict] = None,
        # Dense FFN configuration
        first_n_layers_dense: int = 0,
        ffn_dim: Optional[int] = None,
        activation: str = "silu",
        # Vision settings
        channel_size: int = 3,
        spatial_patch_size: int = 16,
        temporal_patch_size: int = 1,
        # RoPE settings
        enable_3d_rope: bool = True,
        rope_theta: float = 100000.0,
        rope_min_freqs: float = 1.0,
        rope_max_freqs: float = 20.0,
        max_seq_len: int = 8192,
        # Normalization
        norm_eps: float = 1e-5,
        use_qk_norm: bool = True,
        use_tok_norm: bool = True,
        parameterized_norm: bool = True,
        # Distillation settings
        n_storage_tokens: int = 4,
        teachers: Tuple[str, ...] = ("siglip2", "dinov3"),
        teachers_dim: Tuple[int, ...] = (1152, 1024),
        # FlexAttention
        use_flex_attn: bool = True,
        **kwargs,
    ):
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.n_kv_heads = n_kv_heads

        self.moe_dim = moe_dim
        # Default MoEArgs matching configs.py
        self.moe_args = moe_args if moe_args is not None else {
            "num_experts": 16,
            "num_shared_experts": 1,
            "top_k": 3,
            "score_before_experts": False,
            "route_norm": True,
            "route_scale": 0.8633,
            "activation": "relu2",
            "score_func": "sigmoid",
        }

        self.first_n_layers_dense = first_n_layers_dense
        self.ffn_dim = ffn_dim
        self.activation = activation

        self.channel_size = channel_size
        self.spatial_patch_size = spatial_patch_size
        self.temporal_patch_size = temporal_patch_size

        self.enable_3d_rope = enable_3d_rope
        self.rope_theta = rope_theta
        self.rope_min_freqs = rope_min_freqs
        self.rope_max_freqs = rope_max_freqs
        self.max_seq_len = max_seq_len

        self.norm_eps = norm_eps
        self.use_qk_norm = use_qk_norm
        self.use_tok_norm = use_tok_norm
        self.parameterized_norm = parameterized_norm

        self.n_storage_tokens = n_storage_tokens
        self.teachers = teachers
        self.teachers_dim = teachers_dim

        self.use_flex_attn = use_flex_attn

        super().__init__(**kwargs)
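The defaults above correspond to a larger MoE variant; `config.json` in this repo overrides them for the dense 70M model. A minimal sketch of constructing the config by hand, using only values visible in the `config.json` diff (everything else stays at the class defaults), assuming the file is on the import path:

```python
from configuration_siglino import SigLinoConfig

config = SigLinoConfig(
    dim=512,
    head_dim=64,
    first_n_layers_dense=12,
    max_seq_len=8192,
    moe_args={"num_experts": 1, "activation": "silu"},  # as in config.json
)
print(config.model_type)  # "siglino"
print(config.teachers)    # ("siglip2", "dinov3")
```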
image_processing_siglino.py
ADDED
@@ -0,0 +1,121 @@

import numpy as np
import torch
from PIL import Image
from typing import List, Optional, Union, Dict
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.utils import logging

# Local imports of the shared preprocessing helpers
# (smart_resize, convert_image_to_patches and pad_along_first_dim live in image_processor.py)
from .image_processor import smart_resize, convert_image_to_patches, pad_along_first_dim

logger = logging.get_logger(__name__)

class SigLinoImageProcessor(BaseImageProcessor):
    model_input_names = ["pixel_values", "padding_mask", "spatial_shapes"]

    def __init__(
        self,
        patch_size: int = 16,
        min_pixels: int = 128 * 128,
        max_pixels: int = 256 * 256,
        image_mean: Optional[List[float]] = None,
        image_std: Optional[List[float]] = None,
        do_resize: bool = True,
        do_rescale: bool = True,
        do_normalize: bool = True,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean if image_mean is not None else [0.5, 0.5, 0.5]
        self.image_std = image_std if image_std is not None else [0.5, 0.5, 0.5]
        self.do_resize = do_resize
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize

    def preprocess_single(self, image: Image.Image) -> Dict:
        """Standard preprocessing for a single PIL image."""
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        image = image.convert("RGB")
        width, height = image.size  # PIL uses (W, H)

        # 1. Smart Resize
        if self.do_resize:
            resized_height, resized_width = smart_resize(
                height, width,
                factor=self.patch_size,
                min_pixels=self.min_pixels,
                max_pixels=self.max_pixels,
            )
            image = image.resize((resized_width, resized_height), Image.BICUBIC)
        else:
            resized_height, resized_width = height, width

        image_np = np.array(image).astype(np.float32)

        # 2. Rescale
        if self.do_rescale:
            image_np = image_np / 255.0

        # 3. Normalize
        if self.do_normalize:
            mean = np.array(self.image_mean, dtype=np.float32)
            std = np.array(self.image_std, dtype=np.float32)
            image_np = (image_np - mean) / std

        spatial_shape = (resized_height // self.patch_size, resized_width // self.patch_size)

        # Convert to tensor and patchify
        img_tensor = torch.from_numpy(image_np)
        patches = convert_image_to_patches(img_tensor, self.patch_size)

        return {
            "patches": patches,
            "spatial_shape": spatial_shape
        }

    def preprocess(
        self,
        images: Union[Image.Image, List[Image.Image]],
        max_num_patches: int = 256,
        return_tensors: Optional[str] = "pt",
        **kwargs
    ) -> BatchFeature:
        """Main entry point for the transformers image processor."""
        if not isinstance(images, (list, tuple)):
            images = [images]

        results = [self.preprocess_single(img) for img in images]

        batched_pixels = []
        batched_masks = []
        batched_shapes = []

        for res in results:
            patches = res["patches"]
            shape = res["spatial_shape"]

            # Padding logic
            patches_padded, mask = pad_along_first_dim(
                patches,
                max_num_patches,
                pad_value=0.0
            )

            batched_pixels.append(patches_padded)
            batched_masks.append(mask)
            batched_shapes.append(list(shape))

        data = {
            "pixel_values": torch.stack(batched_pixels),
            "padding_mask": torch.stack(batched_masks),
            "spatial_shapes": torch.tensor(batched_shapes)
        }

        return BatchFeature(data=data, tensor_type=return_tensors)
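A minimal sketch of the processor's output contract, assuming `convert_image_to_patches` flattens each 16x16 RGB patch into a 768-value row (the size `img_projector` in `modeling_siglino.py` expects) and loading the processor through the Hub as in the README:

```python
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("tiiuae/siglino-70M", trust_remote_code=True)
image = Image.new("RGB", (640, 480))  # dummy image

batch = processor(image, max_num_patches=256)
print(batch["pixel_values"].shape)  # (1, 256, 768): flattened patches, zero-padded to max_num_patches
print(batch["padding_mask"].shape)  # (1, 256): 1 for real patches, 0 for padding
print(batch["spatial_shapes"])      # (1, 2): patch-grid (height, width) after smart_resize
```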
image_processor.py
CHANGED
@@ -70,8 +70,8 @@ def pad_along_first_dim(
     return array, mask
 
 
-class 
-    """Image processor for 
+class SigLinoImageProcessor:
+    """Image processor for SigLino model.
     """
 
     def __init__(
modeling_siglino.py
ADDED
@@ -0,0 +1,339 @@

import torch
import torch.nn as nn
import torch.nn.functional as F
import einops as E
from typing import Optional, Dict, Union, Tuple
from transformers import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput

# Relative imports from the other modules shipped with this checkpoint
from .configuration_siglino import SigLinoConfig
from .attention import Attention, create_attention_mask
from .moe import MoE, FeedForward
from .rope import (
    precompute_freqs_cis,
    precompute_golden_freqs_cis,
    apply_golden_freqs_cis_to_visual_pos,
)

class PytorchGELUTanh(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.gelu(x, approximate="tanh")

class Siglip2MLP(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.activation_fn = PytorchGELUTanh()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states

class Siglip2MultiheadAttentionPoolingHead(nn.Module):
    def __init__(self, hidden_size: int, num_attention_heads: int, output_dim: int):
        super().__init__()
        self.probe = nn.Parameter(torch.randn(1, 1, hidden_size))
        self.attention = nn.MultiheadAttention(hidden_size, num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(hidden_size, eps=1e-5)
        self.mlp = Siglip2MLP(hidden_size, 4304)
        self.num_heads = num_attention_heads

    def forward(self, hidden_state: torch.Tensor, attention_mask: torch.Tensor | None = None) -> torch.Tensor:
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        if attention_mask is not None:
            # Mask expansion logic kept from the original model.py
            # Note: This uses einops and specific expansion for MHA
            def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
                bsz, src_len = mask.size()
                tgt_len = tgt_len if tgt_len is not None else src_len
                expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
                inverted_mask = torch.tensor(1.0, dtype=dtype, device=mask.device) - expanded_mask
                return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)

            attention_mask = E.rearrange(attention_mask, "(b s) -> b s", b=batch_size)
            target_len, source_len = probe.shape[1], hidden_state.shape[1]
            attention_mask = _expand_mask(attention_mask, hidden_state.dtype, target_len)
            attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
            attention_mask = attention_mask.reshape(-1, target_len, source_len)

        hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]
        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)
        return hidden_state[:, 0]

class Adapter(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, bias: bool = True):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, out_dim)
        self.norm = nn.LayerNorm(out_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(out_dim, out_dim, bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.norm(x)
        x = self.act(x)
        x = self.fc2(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, layer_id: int, config: SigLinoConfig):
        super().__init__()
        self.dim = config.dim
        self.parameterized_norm = getattr(config, 'parameterized_norm', True)
        if self.parameterized_norm:
            self.attention_norm = nn.RMSNorm(config.dim, eps=config.norm_eps)
            self.ffn_norm = nn.RMSNorm(config.dim, eps=config.norm_eps)

        self.attention = Attention(
            dim=config.dim,
            n_heads=config.n_heads,
            n_kv_heads=config.n_kv_heads,
            head_dim=config.head_dim,
            use_qk_norm=config.use_qk_norm,
            enable_3d_rope=config.enable_3d_rope,
            use_flex_attn=config.use_flex_attn,
            use_sink_attn=True,
        )

        # Handle MoE initialization from config dict
        moe_args = config.moe_args
        if isinstance(moe_args, dict):
            from .moe import MoEArgs
            moe_args = MoEArgs(**moe_args)

        first_n_dense = getattr(config, 'first_n_layers_dense', 0)
        use_dense = layer_id < first_n_dense
        if use_dense:
            ffn_hidden = getattr(config, 'ffn_dim', None) or config.moe_dim
            activation = getattr(config, 'activation', 'silu')
            self.feed_forward = FeedForward(config.dim, ffn_hidden, activation=activation)
            self.moe_enabled = False
        elif moe_args and moe_args.num_experts > 0:
            self.moe = MoE(moe_args, dim=config.dim, hidden_dim=config.moe_dim)
            self.moe_enabled = True
        else:
            self.feed_forward = FeedForward(config.dim, config.moe_dim)
            self.moe_enabled = False

        self.weight_init_std = 0.02 / (2 * (layer_id + 1)) ** 0.5

    def forward(self, x, freqs_cis, freqs_cis_2d=None, pos_thw=None, attention_masks=None, compile=False):
        if self.parameterized_norm:
            x_norm = self.attention_norm(x)
        else:
            x_norm = F.rms_norm(x, (x.size(-1),))
        h = x + self.attention(
            x_norm,
            freqs_cis,
            freqs_cis_2d,
            pos_thw,
            attention_masks=attention_masks,
            compile=compile,
        )
        h_norm = self.ffn_norm(h) if self.parameterized_norm else F.rms_norm(h, (h.size(-1),))
        out = h + self.moe(h_norm) if self.moe_enabled else h + self.feed_forward(h_norm)
        return out

class SigLinoPreTrainedModel(PreTrainedModel):
    config_class = SigLinoConfig
    base_model_prefix = "siglino"
    main_input_name = "pixel_values"
    _no_split_modules = ["TransformerBlock"]

    def _init_weights(self, module):
        # Weight initialization is handled by the internal init_weights call in __init__
        pass

    def _apply(self, fn):
        # Prevent casting complex RoPE buffers (freqs_cis) to real dtypes on model.to(bf16/fp16)
        complex_buffers = {}
        for name, buf in list(self.named_buffers(recurse=False)):
            if buf is not None and buf.is_complex():
                complex_buffers[name] = buf
                del self._buffers[name]

        ret = super()._apply(fn)

        for name, buf in complex_buffers.items():
            dummy = torch.tensor([0.0], device=buf.device)
            res = fn(dummy)

            if not res.is_complex():
                new_buf = buf.to(device=res.device)
            else:
                new_buf = fn(buf)

            persistent = name not in self._non_persistent_buffers_set
            self.register_buffer(name, new_buf, persistent=persistent)

        return ret


class SigLinoModel(SigLinoPreTrainedModel):
    def __init__(self, config: SigLinoConfig):
        super().__init__(config)
        self.config = config
        self.n_layers = config.n_layers
        self.patch_size = config.spatial_patch_size
        self.n_storage_tokens = config.n_storage_tokens

        # Patch embedding
        self.n_pixels_per_patch = config.temporal_patch_size * config.spatial_patch_size ** 2
        self.img_projector = nn.Linear(
            self.n_pixels_per_patch * config.channel_size,
            config.dim,
            bias=False,
        )

        self.cls_token = nn.Parameter(torch.empty(1, 1, config.dim))
        if self.n_storage_tokens > 0:
            self.storage_tokens = nn.Parameter(torch.empty(1, self.n_storage_tokens, config.dim))

        # RoPE
        head_dim = config.head_dim or config.dim // config.n_heads
        d = head_dim // 2
        self.register_buffer("freqs_cis_golden", self._precompute_golden_freqs_cis(d, config))
        self.register_buffer("freqs_cis", self._precompute_freqs_cis(d, config), persistent=False)

        self.layers = nn.ModuleList([TransformerBlock(i, config) for i in range(config.n_layers)])
        self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps)

        # Teacher adapters
        teachers_dict = dict(zip(config.teachers, config.teachers_dim))
        dinov3_dim = teachers_dict.get("dinov3", 1280)
        siglip2_dim = teachers_dict.get("siglip2", 1152)

        self.dinov3_adapter = Adapter(config.dim, dinov3_dim, bias=False)
        self.siglip2_adapter = Adapter(config.dim, siglip2_dim, bias=False)
        self.layer_norm_dinov3 = nn.LayerNorm(dinov3_dim)
        self.siglip2_multihead_attention_pooling_head = Siglip2MultiheadAttentionPoolingHead(
            siglip2_dim, 16, siglip2_dim
        )

        self.post_init()

    def _precompute_freqs_cis(self, head_dim: int, config: SigLinoConfig) -> torch.Tensor:
        return precompute_freqs_cis(head_dim, config.max_seq_len, config.rope_theta)

    def _precompute_golden_freqs_cis(self, head_dim: int, config: SigLinoConfig) -> torch.Tensor:
        return precompute_golden_freqs_cis(
            config.n_heads, head_dim, config.rope_min_freqs, config.rope_max_freqs
        )

    def _get_thw_pos(self, batch_size, num_patches, spatial_shapes, device):
        N = batch_size
        R = 1 + self.n_storage_tokens
        S = R + num_patches
        tpos = torch.zeros((N, S), dtype=torch.float32, device=device)
        hpos = torch.zeros((N, S), dtype=torch.float32, device=device)
        wpos = torch.zeros((N, S), dtype=torch.float32, device=device)

        for n in range(N):
            H, W = spatial_shapes[n].tolist()
            h_coords = torch.arange(H, device=device).float()
            w_coords = torch.arange(W, device=device).float()
            xlim, ylim = (W / H) ** 0.5, (H / W) ** 0.5
            h_norm = -ylim + 2 * ylim * h_coords / max(H - 1, 1)
            w_norm = -xlim + 2 * xlim * w_coords / max(W - 1, 1)

            # Vectorized fill for patches
            h_grid, w_grid = torch.meshgrid(h_norm, w_norm, indexing='ij')
            hpos[n, R:R+H*W] = h_grid.reshape(-1)
            wpos[n, R:R+H*W] = w_grid.reshape(-1)

            hpos[n, :R], wpos[n, :R] = float('nan'), float('nan')

        return torch.stack([tpos, hpos, wpos], dim=0)

    def forward(
        self,
        pixel_values: torch.Tensor,
        padding_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.Tensor] = None,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        compile: bool = True,
    ) -> Union[Dict, Tuple]:
        N, L, _ = pixel_values.shape
        device = pixel_values.device
        R = 1 + self.n_storage_tokens

        if padding_mask is None:
            padding_mask = torch.ones((N, L), dtype=pixel_values.dtype, device=device)

        h_NLD = self.img_projector(pixel_values)
        cls_expanded = self.cls_token.expand(N, -1, -1)
        if self.n_storage_tokens > 0:
            reg_expanded = self.storage_tokens.expand(N, -1, -1)
            h_NSD = torch.cat([cls_expanded, reg_expanded, h_NLD], dim=1)
        else:
            h_NSD = torch.cat([cls_expanded, h_NLD], dim=1)

        S = h_NSD.shape[1]
        cls_reg_mask = torch.ones((N, R), dtype=padding_mask.dtype, device=device)
        full_mask = torch.cat([cls_reg_mask, padding_mask], dim=1)

        # FlexAttention Mask
        def mask_mod(b, h, q_idx, kv_idx):
            return full_mask.bool()[b, q_idx] & full_mask.bool()[b, kv_idx]

        block_mask = create_attention_mask(mask_mod, N, None, S, S)

        # RoPE
        thw_pos = self._get_thw_pos(N, L, spatial_shapes, device)
        pos_thw = E.rearrange(thw_pos, "p n s -> n s p").to(dtype=torch.float32)
        patch_mask_2d = torch.zeros((N, S), dtype=torch.bool, device=device)
        patch_mask_2d[:, R:] = padding_mask.bool()
        pos_thw[:, :, 1:] = pos_thw[:, :, 1:].masked_fill(~patch_mask_2d.unsqueeze(-1), float("nan"))

        freqs_cis_golden = apply_golden_freqs_cis_to_visual_pos(
            self.freqs_cis_golden.to(dtype=pos_thw.dtype), pos_thw[:, :, 1:]
        )

        all_hidden_states = () if output_hidden_states else None
        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (h_NSD,)
            h_NSD = layer(h_NSD, self.freqs_cis, freqs_cis_2d=freqs_cis_golden,
                          pos_thw=pos_thw, attention_masks=block_mask, compile=compile)

        h_NSD = self.norm(h_NSD)

        # Feature Extraction & Adapters
        cls_feats = h_NSD[:, 0]
        patch_feats = h_NSD[:, R:]

        student_patch_dinov3 = self.dinov3_adapter(patch_feats)
        student_patch_siglip = self.siglip2_adapter(patch_feats)
        student_cls_dinov3 = self.dinov3_adapter(cls_feats)

        h_sig = self.siglip2_adapter(h_NSD)
        siglip_attn_mask = full_mask.reshape(-1)
        student_summary_siglip = self.siglip2_multihead_attention_pooling_head(h_sig, siglip_attn_mask)

        output = {
            "last_hidden_state": h_NSD,
            "patch_features": {
                "dinov3": student_patch_dinov3,
                "siglip2": student_patch_siglip,
                "siglino": patch_feats,
            },
            "summary_features": {
                "dinov3": student_cls_dinov3,
                "siglip2": student_summary_siglip,
                "siglino": cls_feats,
            },
            "hidden_states": all_hidden_states,
        }

        if not return_dict:
            return tuple(v for v in output.values() if v is not None)
        return output
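A minimal sketch of the forward contract, assuming `model` and `batch` were obtained as in the README and image-processor snippets above; it only inspects the dictionary that `forward` returns:

```python
import torch

# `model` / `batch` as produced in the earlier snippets (README usage, SigLinoImageProcessor).
batch = {k: v.to("cuda") for k, v in batch.items()}
batch["pixel_values"] = batch["pixel_values"].to(torch.bfloat16)

with torch.no_grad():
    out = model(**batch)

# One cls token plus n_storage_tokens register tokens sit in front of the patch tokens.
print(out["last_hidden_state"].shape)            # (B, 1 + n_storage_tokens + L, dim)
print(sorted(out["patch_features"]))             # ['dinov3', 'siglino', 'siglip2']
print(out["summary_features"]["siglip2"].shape)  # (B, 1152), pooled by the SigLIP2 head
```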
preprocessor_config.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "auto_map": {
-    "AutoImageProcessor": "
+    "AutoImageProcessor": "image_processing_siglino.SigLinoImageProcessor"
   },
   "do_normalize": true,
   "do_rescale": true,
@@ -10,7 +10,7 @@
     0.5,
     0.5
   ],
-  "image_processor_type": "
+  "image_processor_type": "SigLinoImageProcessor",
   "image_std": [
     0.5,
     0.5,
utils.py
CHANGED
@@ -9,21 +9,21 @@ from PIL import Image
 from typing import Union, List
 import os
 
-from .model import 
-from .configs import 
-from .image_processor import 
+from .model import SigLino
+from .configs import SigLinoArgs, siglino_configs
+from .image_processor import SigLinoImageProcessor
 
 
 
-def 
+def load_siglino_model(
     checkpoint_path: str,
-    config_name: str = "
+    config_name: str = "siglino-0.3B",
     device: Union[str, torch.device] = "cuda",
     dtype: torch.dtype | None = None,
     **kwargs,
-) -> tuple[
+) -> tuple[SigLino, SigLinoImageProcessor]:
     """
-    Load a 
+    Load a SigLino model from a checkpoint.
 
     Args:
         checkpoint_path: Path to the model checkpoint
@@ -35,13 +35,13 @@ def load_amoe_model(
         Tuple of (model, image_processor)
     """
     # Get configuration
-    if config_name in 
-        args = 
+    if config_name in siglino_configs:
+        args = siglino_configs[config_name]
     else:
-        raise ValueError(f"Unknown config: {config_name}. Available: {list(
+        raise ValueError(f"Unknown config: {config_name}. Available: {list(siglino_configs.keys())}")
 
     # Create model
-    model = 
+    model = SigLino(args)
 
     # Standard PyTorch checkpoint
     state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
@@ -55,7 +55,7 @@ def load_amoe_model(
     model.eval()
 
     # Create image processor
-    image_processor = 
+    image_processor = SigLinoImageProcessor(patch_size=args.spatial_patch_size, **kwargs)
 
     return model, image_processor
 
@@ -178,7 +178,7 @@ def load_amoe_model(
 FEATURE_DIM_DICT = {
     "dinov3": 1024,
     "siglip2": 1152,
-    "
+    "siglino": 768,  # Model dimension
 }
 
 PATCH_SIZE = 16
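utils.py keeps a raw-checkpoint loading path next to the Hub integration. A minimal sketch of using it, with a placeholder checkpoint path and the default config name; valid names come from `siglino_configs` in `configs.py`, which is not shown in this commit:

```python
import torch
from utils import load_siglino_model  # within the repo package this resolves via relative imports

# Placeholder checkpoint path for illustration only.
model, processor = load_siglino_model(
    checkpoint_path="checkpoints/siglino.pt",
    config_name="siglino-0.3B",
    device="cuda",
    dtype=torch.bfloat16,
)
```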