Gabriele committed
Commit 37b751d · 1 parent: c6d8ab9
Using safetensors for weights loading

Files changed:
- README.md (+41 -6)
- config.json (+4 -4)
- megaloc_model.py (+603 -192)
- model.safetensors (+2 -2)
README.md — CHANGED

````diff
@@ -1,17 +1,52 @@
 ---
+pipeline_tag: image-feature-extraction
+library_name: pytorch
+license: mit
 tags:
+- visual-place-recognition
+- image-retrieval
 - pytorch_model_hub_mixin
 - arxiv:2502.17237
-license: mit
 ---
 
 # MegaLoc
-MegaLoc is an image retrieval model for any localization task, which achieves SOTA on most VPR datasets, including indoor and outdoor ones.
-You can find details in our paper [MegaLoc: One Retrieval to Place Them All](https://arxiv.org/abs/2502.17237)
 
+MegaLoc is an image retrieval model for visual place recognition (VPR) that achieves state-of-the-art on most VPR datasets, including indoor and outdoor environments.
+
+**Paper:** [MegaLoc: One Retrieval to Place Them All](https://arxiv.org/abs/2502.17237) (CVPR 2025 Workshop)
+
+**GitHub:** [gmberton/MegaLoc](https://github.com/gmberton/MegaLoc)
+
+## Usage
+
+```python
+import torch
+
+model = torch.hub.load("gmberton/MegaLoc", "get_trained_model")
+model.eval()
+
+# Extract descriptor from an image
+image = torch.randn(1, 3, 322, 322)  # [B, 3, H, W] - any size works
+with torch.no_grad():
+    descriptor = model(image)  # [B, 8448] L2-normalized descriptor
+```
+
+For benchmarking on VPR datasets, see [VPR-methods-evaluation](https://github.com/gmberton/VPR-methods-evaluation).
+
+## Qualitative Examples
+
+Top-1 retrieved images from the SF-XL test set (2.8M database images):
 
 
 
+## Citation
+
+```bibtex
+@InProceedings{Berton_2025_CVPR,
+    author    = {Berton, Gabriele and Masone, Carlo},
+    title     = {MegaLoc: One Retrieval to Place Them All},
+    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
+    month     = {June},
+    year      = {2025},
+    pages     = {2861-2867}
+}
+```
````
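Since the README stops at extracting a single descriptor, here is a minimal retrieval sketch that builds on the usage snippet above; the random tensors are stand-ins for real database and query images, and the hub entry point is the one shown in the README.

```python
import torch

model = torch.hub.load("gmberton/MegaLoc", "get_trained_model").eval()

# Placeholder images; in practice these are RGB crops of real photos
db_images = torch.randn(8, 3, 322, 322)
query_images = torch.randn(2, 3, 322, 322)

with torch.no_grad():
    db_descriptors = model(db_images)        # [8, 8448]
    query_descriptors = model(query_images)  # [2, 8448]

# Descriptors are L2-normalized, so cosine similarity is a plain matrix product
similarity = query_descriptors @ db_descriptors.T  # [2, 8]
top1 = similarity.argmax(dim=1)  # best-matching database image per query
```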
config.json — CHANGED

```diff
@@ -1,7 +1,7 @@
 {
-  "cluster_dim": 256,
   "feat_dim": 8448,
-  "mlp_dim": 512,
   "num_clusters": 64,
-  "token_dim": 256
-}
+  "cluster_dim": 256,
+  "token_dim": 256,
+  "mlp_dim": 512
+}
```
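For orientation, the aggregator's raw output size follows from these fields as `num_clusters * cluster_dim + token_dim`, and a final linear layer projects it down to `feat_dim` (see `DescriptorAggregator` in megaloc_model.py below); a quick check of the arithmetic:

```python
num_clusters, cluster_dim, token_dim, feat_dim = 64, 256, 256, 8448

# Raw aggregated descriptor: one cluster_dim vector per cluster, plus the global token
aggregator_output_dim = num_clusters * cluster_dim + token_dim
assert aggregator_output_dim == 16640  # then nn.Linear(16640, feat_dim) yields 8448
```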
megaloc_model.py — CHANGED

@@ -1,256 +1,667 @@ The file is rewritten wholesale: the previous implementation, which wrapped an externally loaded DINOv2 backbone and an optimal transport (SALAD-style) aggregator, is replaced by the self-contained version below.

```python
"""MegaLoc: One Retrieval to Place Them All

This module implements the MegaLoc model for visual place recognition.
The model combines a Vision Transformer backbone with an optimal transport-based
feature aggregation module.

Paper: https://arxiv.org/abs/2502.17237
License: MIT
"""

import math
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms.functional as tfm
from huggingface_hub import PyTorchModelHubMixin


# ==============================================================================
# Optimal Transport Feature Aggregation
# ==============================================================================
# The following implements an optimal transport-based feature aggregation module
# that converts local patch features into a compact global descriptor.
# ==============================================================================


def sinkhorn_log_iterations(
    source_log_weights: torch.Tensor,
    target_log_weights: torch.Tensor,
    cost_matrix: torch.Tensor,
    num_iterations: int = 20,
    regularization: float = 1.0,
) -> torch.Tensor:
    """Compute optimal transport plan using Sinkhorn iterations in log space.

    This implements the Sinkhorn-Knopp algorithm for computing the entropy-regularized
    optimal transport plan between two distributions. The log-space formulation
    provides numerical stability.

    Args:
        source_log_weights: Log of source distribution weights [batch, m+1]
        target_log_weights: Log of target distribution weights [batch, n]
        cost_matrix: Cost/score matrix [batch, m+1, n]
        num_iterations: Number of Sinkhorn iterations
        regularization: Entropy regularization strength

    Returns:
        Log of the transport plan matrix [batch, m+1, n]
    """
    # Apply regularization scaling
    scaled_costs = cost_matrix / regularization

    # Initialize dual variables
    dual_source = torch.zeros_like(source_log_weights)
    dual_target = torch.zeros_like(target_log_weights)

    # Sinkhorn iterations: alternating row and column normalization
    for _ in range(num_iterations):
        # Row normalization (update source dual); logsumexp already returns [batch, m+1]
        dual_source = source_log_weights - torch.logsumexp(scaled_costs + dual_target.unsqueeze(1), dim=2)
        # Column normalization (update target dual); logsumexp already returns [batch, n]
        dual_target = target_log_weights - torch.logsumexp(scaled_costs + dual_source.unsqueeze(2), dim=1)

    # Compute final transport plan
    transport_plan = scaled_costs + dual_source.unsqueeze(2) + dual_target.unsqueeze(1)
    return transport_plan


def compute_soft_assignments(
    affinity_scores: torch.Tensor,
    slack_logit: float = 1.0,
    num_iterations: int = 3,
    regularization: float = 1.0,
) -> torch.Tensor:
    """Compute soft cluster assignments using optimal transport with slack.

    Augments the affinity matrix with a slack row to handle unassigned features,
    then applies Sinkhorn normalization to get valid transport probabilities.

    Args:
        affinity_scores: Raw affinity scores [batch, num_clusters, num_patches]
        slack_logit: Initial logit value for the slack row
        num_iterations: Number of Sinkhorn iterations
        regularization: Entropy regularization strength

    Returns:
        Log-probabilities of assignments [batch, num_clusters+1, num_patches]
    """
    batch_size, num_clusters, num_patches = affinity_scores.size()

    # Augment score matrix with slack row for handling outliers/unmatched
    augmented_scores = torch.empty(
        batch_size,
        num_clusters + 1,
        num_patches,
        dtype=affinity_scores.dtype,
        device=affinity_scores.device,
    )
    augmented_scores[:, :num_clusters, :num_patches] = affinity_scores
    augmented_scores[:, num_clusters, :] = slack_logit

    # Prepare log-weights for source (clusters + slack) and target (patches)
    log_normalization = -torch.tensor(math.log(num_patches + num_clusters), device=affinity_scores.device)

    # Source weights: uniform over clusters, extra mass on slack
    source_log = log_normalization.expand(num_clusters + 1).contiguous()
    source_log = source_log.clone()
    source_log[-1] = source_log[-1] + math.log(num_patches - num_clusters)

    # Target weights: uniform over patches
    target_log = log_normalization.expand(num_patches).contiguous()

    # Expand to batch dimension
    source_log = source_log.expand(batch_size, -1)
    target_log = target_log.expand(batch_size, -1)

    # Solve optimal transport
    log_transport = sinkhorn_log_iterations(
        source_log,
        target_log,
        augmented_scores,
        num_iterations=num_iterations,
        regularization=regularization,
    )

    return log_transport - log_normalization


class FeatureAggregationHead(nn.Module):
    """Optimal transport-based aggregation of local features into global descriptor.

    This module learns to aggregate local patch features into a compact global
    representation using differentiable optimal transport. It produces:
    1. A global scene token from the CLS token
    2. Cluster-aggregated local descriptors weighted by transport probabilities

    The final descriptor is the L2-normalized concatenation of both components.

    Args:
        input_channels: Number of input feature channels (from backbone)
        num_clusters: Number of learned cluster centers
        cluster_channels: Dimensionality of each cluster descriptor
        global_token_dim: Dimensionality of the global scene token
        hidden_dim: Hidden dimension for MLPs
        dropout_rate: Dropout probability (0 to disable)
    """

    def __init__(
        self,
        input_channels: int = 1536,
        num_clusters: int = 64,
        cluster_channels: int = 128,
        global_token_dim: int = 256,
        hidden_dim: int = 512,
        dropout_rate: float = 0.3,
    ) -> None:
        super().__init__()

        self.input_channels = input_channels
        self.num_clusters = num_clusters
        self.cluster_channels = cluster_channels
        self.global_token_dim = global_token_dim
        self.hidden_dim = hidden_dim

        # Dropout layer (or identity if disabled)
        regularization = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity()

        # MLP to project CLS token to global scene descriptor
        self.global_token_mlp = nn.Sequential(
            nn.Linear(self.input_channels, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.global_token_dim),
        )

        # Convolutional MLP to project patch features to cluster descriptors
        self.descriptor_projection = nn.Sequential(
            nn.Conv2d(self.input_channels, self.hidden_dim, 1),
            regularization,
            nn.ReLU(),
            nn.Conv2d(self.hidden_dim, self.cluster_channels, 1),
        )

        # Convolutional MLP to compute cluster assignment logits
        self.assignment_head = nn.Sequential(
            nn.Conv2d(self.input_channels, self.hidden_dim, 1),
            regularization,
            nn.ReLU(),
            nn.Conv2d(self.hidden_dim, self.num_clusters, 1),
        )

        # Learnable slack variable for optimal transport
        self.slack_variable = nn.Parameter(torch.tensor(1.0))

    def forward(self, inputs):
        """Aggregate local and global features into compact descriptor.

        Args:
            inputs: Tuple of (patch_features, cls_token)
                - patch_features: [B, C, H, W] spatial feature map
                - cls_token: [B, C] global CLS token

        Returns:
            Global descriptor [B, num_clusters * cluster_channels + global_token_dim]
        """
        patch_features, cls_token = inputs

        # Project patch features to cluster descriptors: [B, cluster_channels, H*W]
        local_descriptors = self.descriptor_projection(patch_features).flatten(2)

        # Compute assignment logits: [B, num_clusters, H*W]
        assignment_logits = self.assignment_head(patch_features).flatten(2)

        # Project CLS token to global descriptor: [B, global_token_dim]
        global_descriptor = self.global_token_mlp(cls_token)

        # Compute soft assignments via optimal transport
        log_assignments = compute_soft_assignments(assignment_logits, self.slack_variable, num_iterations=3)
        assignments = torch.exp(log_assignments)

        # Remove slack row (keep only cluster assignments)
        assignments = assignments[:, :-1, :]

        # Aggregate local descriptors weighted by assignments
        # assignments: [B, num_clusters, num_patches]
        # local_descriptors: [B, cluster_channels, num_patches]
        # We want: [B, cluster_channels, num_clusters]
        assignments = assignments.unsqueeze(1).repeat(1, self.cluster_channels, 1, 1)
        local_descriptors = local_descriptors.unsqueeze(2).repeat(1, 1, self.num_clusters, 1)

        # Weighted sum over patches for each cluster
        aggregated_clusters = (local_descriptors * assignments).sum(dim=-1)

        # Normalize and concatenate
        normalized_global = F.normalize(global_descriptor, p=2, dim=-1)
        normalized_local = F.normalize(aggregated_clusters, p=2, dim=1).flatten(1)

        combined = torch.cat([normalized_global, normalized_local], dim=-1)

        return F.normalize(combined, p=2, dim=-1)


# ==============================================================================
# Vision Transformer Components
# ==============================================================================


class PatchEmbedding(nn.Module):
    """Convert image patches to embeddings using a convolutional layer."""

    def __init__(
        self,
        image_size: int = 518,
        patch_size: int = 14,
        in_channels: int = 3,
        embed_dim: int = 768,
    ):
        super().__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, C, H, W] -> [B, embed_dim, H/patch_size, W/patch_size]
        x = self.proj(x)
        # Flatten spatial dimensions: [B, embed_dim, num_patches]
        x = x.flatten(2)
        # Transpose to [B, num_patches, embed_dim]
        x = x.transpose(1, 2)
        return x


class LayerScale(nn.Module):
    """Learnable per-channel scaling as used in CaiT and DINOv2."""

    def __init__(self, dim: int, init_value: float = 1e-5):
        super().__init__()
        self.gamma = nn.Parameter(init_value * torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * self.gamma


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention module."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 12,
        qkv_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape

        # Compute Q, K, V
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # [3, B, num_heads, N, head_dim]
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Scaled dot-product attention
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        # Apply attention to values
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class MLP(nn.Module):
    """MLP module with GELU activation."""

    def __init__(
        self,
        in_features: int,
        hidden_features: int = None,
        out_features: int = None,
        drop: float = 0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class TransformerBlock(nn.Module):
    """Vision Transformer block with LayerScale."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values: float = 1e-5,
    ):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = MultiHeadAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = LayerScale(dim, init_value=init_values)

        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        self.mlp = MLP(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            drop=drop,
        )
        self.ls2 = LayerScale(dim, init_value=init_values)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.ls1(self.attn(self.norm1(x)))
        x = x + self.ls2(self.mlp(self.norm2(x)))
        return x


class VisionTransformerBackbone(nn.Module):
    """DINOv2 Vision Transformer backbone for feature extraction.

    This implements a ViT-B/14 architecture compatible with DINOv2 weights.
    The positional encoding interpolation matches the Facebook implementation
    for exact output compatibility.
    """

    def __init__(
        self,
        image_size: int = 518,
        patch_size: int = 14,
        in_channels: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_channels = embed_dim  # For compatibility

        # Patch embedding
        self.patch_embed = PatchEmbedding(
            image_size=image_size,
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dim=embed_dim,
        )

        # Positional encoding interpolation parameters (matching Facebook's DINO)
        self.interpolate_offset = 0.1
        self.interpolate_antialias = False

        # Class token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # Positional embedding (for 37x37 = 1369 patches + 1 CLS token = 1370)
        num_patches = (image_size // patch_size) ** 2
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))

        # Transformer blocks
        self.blocks = nn.ModuleList(
            [
                TransformerBlock(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                )
                for _ in range(depth)
            ]
        )

        # Final layer norm
        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)

    def interpolate_pos_encoding(self, x: torch.Tensor, w: int, h: int) -> torch.Tensor:
        """Interpolate positional encoding for different input sizes.

        This matches the Facebook DINOv2 implementation exactly, including
        the interpolation offset kludge for backward compatibility.
        """
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1  # Exclude CLS token
        N = self.pos_embed.shape[1] - 1  # Number of patches in pos_embed

        # If input matches training resolution, return as-is
        if npatch == N and w == h:
            return self.pos_embed

        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        M = int(math.sqrt(N))  # Original number of patches per dimension

        # Use scale_factor with offset for backward compatibility
        # This is the "kludge" from Facebook's DINO implementation
        sx = float(w0 + self.interpolate_offset) / M
        sy = float(h0 + self.interpolate_offset) / M

        patch_pos_embed = F.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            scale_factor=(sx, sy),
            mode="bicubic",
            antialias=self.interpolate_antialias,
        )

        assert (w0, h0) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)

    def prepare_tokens(self, x: torch.Tensor) -> torch.Tensor:
        """Prepare input tokens with positional encoding."""
        B, C, W, H = x.shape

        # Patch embedding
        x = self.patch_embed(x)

        # Add CLS token
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # Add positional encoding
        x = x + self.interpolate_pos_encoding(x, W, H)

        return x

    def forward(self, images: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Extract features from images.

        Args:
            images: Input images [B, 3, H, W] where H, W are multiples of 14

        Returns:
            Tuple of:
                - patch_features: [B, 768, H//14, W//14] spatial feature map
                - cls_token: [B, 768] global CLS token
        """
        batch_size, _, height, width = images.shape

        # Prepare tokens with positional encoding
        x = self.prepare_tokens(images)

        # Apply transformer blocks
        for block in self.blocks:
            x = block(x)

        # Apply final layer norm
        x = self.norm(x)

        # Extract CLS token and patch tokens
        cls_token = x[:, 0]
        patch_tokens = x[:, 1:]

        # Reshape patch tokens to spatial format
        h_patches = height // self.patch_size
        w_patches = width // self.patch_size
        patch_features = patch_tokens.reshape(batch_size, h_patches, w_patches, self.embed_dim).permute(0, 3, 1, 2)

        return patch_features, cls_token


# ==============================================================================
# Feature Dimension Reduction
# ==============================================================================


class DescriptorAggregator(nn.Module):
    """Wrapper combining feature aggregation with linear projection.

    Applies the optimal transport aggregation followed by a linear layer
    to reduce dimensionality to the desired output size.

    Args:
        output_dim: Final descriptor dimensionality
        aggregator_config: Configuration for FeatureAggregationHead
        aggregator_output_dim: Output dimension of the aggregation head
    """

    def __init__(self, output_dim: int, aggregator_config: dict, aggregator_output_dim: int):
        super().__init__()
        self.aggregation = FeatureAggregationHead(**aggregator_config)
        self.projection = nn.Linear(aggregator_output_dim, output_dim)

    def forward(self, x):
        aggregated = self.aggregation(x)
        return self.projection(aggregated)


# ==============================================================================
# L2 Normalization Layer
# ==============================================================================


class L2Normalize(nn.Module):
    """L2 normalization layer."""

    def __init__(self, dim: int = -1):
        super().__init__()
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.normalize(x, p=2, dim=self.dim)


# ==============================================================================
# Main Model
# ==============================================================================


class MegaLoc(nn.Module, PyTorchModelHubMixin):
    """MegaLoc: Unified visual place recognition model.

    Combines a DINOv2 Vision Transformer backbone with optimal transport-based
    feature aggregation to produce compact, discriminative image descriptors
    for place recognition and image retrieval tasks.

    Args:
        feat_dim: Output descriptor dimensionality (default: 8448)
        num_clusters: Number of cluster centers for aggregation (default: 64)
        cluster_dim: Dimensionality of cluster descriptors (default: 256)
        token_dim: Dimensionality of global scene token (default: 256)
        mlp_dim: Hidden dimension for MLPs (default: 512)

    Example:
        >>> model = MegaLoc.from_pretrained("gberton/MegaLoc")
        >>> model.eval()
        >>> image = torch.randn(1, 3, 322, 322)  # H and W are rounded to multiples of 14 if needed
        >>> descriptor = model(image)  # [1, 8448]
    """

    def __init__(
        self,
        feat_dim: int = 8448,
        num_clusters: int = 64,
        cluster_dim: int = 256,
        token_dim: int = 256,
        mlp_dim: int = 512,
    ):
        super().__init__()

        self.backbone = VisionTransformerBackbone()

        # Aggregator output: num_clusters * cluster_dim + token_dim
        self.aggregator_output_dim = num_clusters * cluster_dim + token_dim

        self.aggregator = DescriptorAggregator(
            output_dim=feat_dim,
            aggregator_config={
                "input_channels": self.backbone.num_channels,
                "num_clusters": num_clusters,
                "cluster_channels": cluster_dim,
                "global_token_dim": token_dim,
                "hidden_dim": mlp_dim,
            },
            aggregator_output_dim=self.aggregator_output_dim,
        )

        self.feat_dim = feat_dim
        self.normalize = L2Normalize()

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        """Extract global descriptor from images.

        Args:
            images: Input images [B, 3, H, W]

        Returns:
            L2-normalized descriptors [B, feat_dim]
        """
        batch_size, channels, height, width = images.shape

        # Ensure dimensions are multiples of 14 (ViT patch size)
        if height % 14 != 0 or width % 14 != 0:
            height = round(height / 14) * 14
            width = round(width / 14) * 14
            images = tfm.resize(images, [height, width], antialias=True)

        # Extract backbone features
        features = self.backbone(images)

        # Aggregate into global descriptor
        descriptor = self.aggregator(features)

        # Final L2 normalization
        return self.normalize(descriptor)
```
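The Sinkhorn solver is the core of the aggregation head above. As a sanity check, one can verify that the returned log transport plan reproduces the prescribed marginals; this is a small sketch with hypothetical shapes and random scores, not part of the commit:

```python
import math
import torch

from megaloc_model import sinkhorn_log_iterations

B, m, n = 2, 5, 9  # batch, source bins (clusters + slack), target bins (patches)
log_a = torch.full((B, m), -math.log(m))  # uniform source marginals, in log space
log_b = torch.full((B, n), -math.log(n))  # uniform target marginals, in log space
scores = torch.randn(B, m, n)

log_plan = sinkhorn_log_iterations(log_a, log_b, scores, num_iterations=50)

# Column marginals are exact after the final column update; row marginals
# converge as the number of iterations grows.
assert torch.allclose(log_plan.logsumexp(dim=1), log_b, atol=1e-5)
assert torch.allclose(log_plan.logsumexp(dim=2), log_a, atol=1e-3)
```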
model.safetensors — CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9d8716ac9959a86e00494f605a4be46aebed15694ab4ad77c27b91ada9ab51e4
+size 914577620
```
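This file is the point of the commit: with the weights stored as safetensors, `PyTorchModelHubMixin.from_pretrained` reads `config.json` for the constructor arguments and `model.safetensors` for the weights. A minimal loading sketch; the repo id "gberton/MegaLoc" is taken from the docstring example above and may differ:

```python
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

from megaloc_model import MegaLoc

# Via PyTorchModelHubMixin: config.json supplies the kwargs,
# model.safetensors supplies the weights.
model = MegaLoc.from_pretrained("gberton/MegaLoc").eval()

# Or manually: download the checkpoint and load the state dict.
weights_path = hf_hub_download("gberton/MegaLoc", "model.safetensors")
model = MegaLoc()
model.load_state_dict(load_file(weights_path))
```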