Gabriele committed · Commit 53edac2 · Parent(s): 37b751d

Using safetensors for weights loading

Browse files:
- megaloc_model.py +156 -354
- model.safetensors +2 -2
megaloc_model.py CHANGED

@@ -18,227 +18,143 @@ import torchvision.transforms.functional as tfm
 from huggingface_hub import PyTorchModelHubMixin
 
 
-#
-#
-# ==============================================================================
-
-
-def sinkhorn_log_iterations(
-    source_log_weights: torch.Tensor,
-    target_log_weights: torch.Tensor,
-    cost_matrix: torch.Tensor,
-    num_iterations: int = 20,
-    regularization: float = 1.0,
-) -> torch.Tensor:
-    """Compute optimal transport plan using Sinkhorn iterations in log space.
-
-    This implements the Sinkhorn-Knopp algorithm for computing the entropy-regularized
-    optimal transport plan between two distributions. The log-space formulation
-    provides numerical stability.
-
-    Args:
-        source_log_weights: Log-weights of the source distribution
-        target_log_weights: Log-weights of the target distribution
-        cost_matrix: Cost matrix between source and target
-        num_iterations: Number of Sinkhorn iterations
-        regularization: Entropy regularization strength
-
-    Returns:
-        Log-space optimal transport plan
-    """
-    # Scale costs by the regularization strength
-    scaled_costs = cost_matrix / regularization
-
-    dual_source = torch.zeros_like(source_log_weights)
-    dual_target = torch.zeros_like(target_log_weights)
-
-    for _ in range(num_iterations):
-        # Row normalization (update source dual)
-        dual_source = source_log_weights - torch.logsumexp(scaled_costs + dual_target.unsqueeze(1), dim=2).squeeze()
-        # Column normalization (update target dual)
-        dual_target = target_log_weights - torch.logsumexp(scaled_costs + dual_source.unsqueeze(2), dim=1).squeeze()
-
-    transport_plan = scaled_costs + dual_source.unsqueeze(2) + dual_target.unsqueeze(1)
-    return transport_plan
-
-
-
-    Args:
-        affinity_scores: Raw affinity scores [batch, num_clusters, num_patches]
-        slack_logit: Initial logit value for the slack row
-        num_iterations: Number of Sinkhorn iterations
-        regularization: Entropy regularization strength
-
-    Returns:
-        Log-probabilities of assignments [batch, num_clusters+1, num_patches]
-    """
-    batch_size, num_clusters, num_patches = affinity_scores.size()
-
-    # Augment score matrix with slack row for handling outliers/unmatched
-    augmented_scores = torch.empty(
-        batch_size,
-        num_clusters + 1,
-        num_patches,
-        dtype=affinity_scores.dtype,
-        device=affinity_scores.device,
-    )
-    augmented_scores[:, :num_clusters, :num_patches] = affinity_scores
-    augmented_scores[:, num_clusters, :] = slack_logit
-
-    # Prepare log-weights for source (clusters + slack) and target (patches)
-    log_normalization = -torch.tensor(math.log(num_patches + num_clusters), device=affinity_scores.device)
-
-    # Source weights: uniform over clusters, extra mass on slack
-    source_log = log_normalization.expand(num_clusters + 1).contiguous()
-    source_log = source_log.clone()
-    source_log[-1] = source_log[-1] + math.log(num_patches - num_clusters)
-
-    # Target weights: uniform over patches
-    target_log = log_normalization.expand(num_patches).contiguous()
-
-    # Expand to batch dimension
-    source_log = source_log.expand(batch_size, -1)
-    target_log = target_log.expand(batch_size, -1)
-
-    # Solve optimal transport
-    log_transport = sinkhorn_log_iterations(
-        source_log,
-        target_log,
-        augmented_scores,
-        num_iterations=num_iterations,
-        regularization=regularization,
-    )
-
-    return log_transport - log_normalization
-
-
-class FeatureAggregationHead(nn.Module):
-    """Optimal transport-based aggregation of local features into global descriptor.
-
-    This module produces:
-
-    1. A global scene token from the CLS token
-    2. Cluster-aggregated local descriptors weighted by transport probabilities
-
-    The final descriptor is the L2-normalized concatenation of both components.
-
-    Args:
-        num_clusters: Number of cluster centers
-
-    """
-
-    def __init__(
-        self,
-
-        num_clusters
-
-    ) -> None:
-        super().__init__()
-
-        self.
-        self.num_clusters = num_clusters
-        self.
-        self.
-        self.
-
-        self.global_token_mlp = nn.Sequential(
-            nn.Linear(self.
-        )
-        self.descriptor_projection = nn.Sequential(
-            nn.Conv2d(self.
-            regularization,
-            nn.ReLU(),
-            nn.Conv2d(self.
-        )
-        self.assignment_head = nn.Sequential(
-            nn.Conv2d(self.
-            regularization,
-            nn.ReLU(),
-            nn.Conv2d(self.
-        )
-
-    def forward(self, inputs):
-        """Aggregate local and global features into compact descriptor.
-
-        Args:
-
-        Returns:
-            Global descriptor [B, num_clusters * 
-        """
-        patch_features, cls_token = inputs
-
-        # Project patch features to cluster descriptors: [B, cluster_channels, H*W]
-        local_descriptors = self.descriptor_projection(patch_features).flatten(2)
-
-        # Compute assignment logits: [B, num_clusters, H*W]
-        assignment_logits = self.assignment_head(patch_features).flatten(2)
-
-        # Project CLS token to global descriptor: [B, global_token_dim]
-        global_descriptor = self.global_token_mlp(cls_token)
-
-
-
-        # local_descriptors: [B, cluster_channels, num_patches]
-        # We want: [B, cluster_channels, num_clusters]
-        assignments = assignments.unsqueeze(1).repeat(1, self.cluster_channels, 1, 1)
-        local_descriptors = local_descriptors.unsqueeze(2).repeat(1, 1, self.num_clusters, 1)
-
-        normalized_local = F.normalize((local_descriptors * assignments).sum(dim=-1), p=2, dim=1).flatten(1)
-        normalized_global = F.normalize(global_descriptor, p=2, dim=-1)
-
-        combined = torch.cat([normalized_global, normalized_local], dim=-1)
-
-        return F.normalize(combined, p=2, dim=-1)
 
 
 # ==============================================================================
@@ -249,13 +165,7 @@ class FeatureAggregationHead(nn.Module):
 class PatchEmbedding(nn.Module):
     """Convert image patches to embeddings using a convolutional layer."""
 
-    def __init__(
-        self,
-        image_size: int = 518,
-        patch_size: int = 14,
-        in_channels: int = 3,
-        embed_dim: int = 768,
-    ):
         super().__init__()
         self.image_size = image_size
         self.patch_size = patch_size

@@ -263,11 +173,8 @@ class PatchEmbedding(nn.Module):
         self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # x: [B, C, H, W] -> [B, embed_dim, H/patch_size, W/patch_size]
         x = self.proj(x)
-        # Flatten spatial dimensions: [B, embed_dim, num_patches]
         x = x.flatten(2)
-        # Transpose to [B, num_patches, embed_dim]
         x = x.transpose(1, 2)
         return x
 

@@ -287,12 +194,7 @@ class MultiHeadAttention(nn.Module):
     """Multi-head self-attention module."""
 
     def __init__(
-        self,
-        dim: int,
-        num_heads: int = 12,
-        qkv_bias: bool = True,
-        attn_drop: float = 0.0,
-        proj_drop: float = 0.0,
     ):
         super().__init__()
         self.num_heads = num_heads

@@ -307,17 +209,14 @@ class MultiHeadAttention(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
 
-        # Compute Q, K, V
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
-        qkv = qkv.permute(2, 0, 3, 1, 4)
         q, k, v = qkv[0], qkv[1], qkv[2]
 
-        # Scaled dot-product attention
         attn = (q @ k.transpose(-2, -1)) * self.scale
         attn = attn.softmax(dim=-1)
         attn = self.attn_drop(attn)
 
-        # Apply attention to values
         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
         x = self.proj(x)
         x = self.proj_drop(x)

@@ -328,13 +227,7 @@ class MultiHeadAttention(nn.Module):
 class MLP(nn.Module):
     """MLP module with GELU activation."""
 
-    def __init__(
-        self,
-        in_features: int,
-        hidden_features: int = None,
-        out_features: int = None,
-        drop: float = 0.0,
-    ):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features

@@ -368,21 +261,11 @@ class TransformerBlock(nn.Module):
     ):
         super().__init__()
         self.norm1 = nn.LayerNorm(dim, eps=1e-6)
-        self.attn = MultiHeadAttention(
-            dim,
-            num_heads=num_heads,
-            qkv_bias=qkv_bias,
-            attn_drop=attn_drop,
-            proj_drop=drop,
-        )
         self.ls1 = LayerScale(dim, init_value=init_values)
 
         self.norm2 = nn.LayerNorm(dim, eps=1e-6)
-        self.mlp = MLP(
-            in_features=dim,
-            hidden_features=int(dim * mlp_ratio),
-            drop=drop,
-        )
         self.ls2 = LayerScale(dim, init_value=init_values)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:

@@ -391,12 +274,10 @@ class TransformerBlock(nn.Module):
         return x
 
 
-class VisionTransformerBackbone(nn.Module):
     """DINOv2 Vision Transformer backbone for feature extraction.
 
     This implements a ViT-B/14 architecture compatible with DINOv2 weights.
-    The positional encoding interpolation matches the Facebook implementation
-    for exact output compatibility.
     """
 
     def __init__(

@@ -413,54 +294,34 @@ class VisionTransformerBackbone(nn.Module):
         super().__init__()
         self.patch_size = patch_size
         self.embed_dim = embed_dim
-        self.num_channels = embed_dim
 
-        # Patch embedding
         self.patch_embed = PatchEmbedding(
-            image_size=image_size,
-            patch_size=patch_size,
-            in_channels=in_channels,
-            embed_dim=embed_dim,
         )
 
-        # Positional encoding interpolation parameters (matching Facebook's DINO)
         self.interpolate_offset = 0.1
         self.interpolate_antialias = False
 
-        # Class token
         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
-
-        # Positional embedding (for 37x37 = 1369 patches + 1 CLS token = 1370)
         num_patches = (image_size // patch_size) ** 2
         self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
 
-        # Transformer blocks
         self.blocks = nn.ModuleList(
             [
-                TransformerBlock(
-                    dim=embed_dim,
-                    num_heads=num_heads,
-                    mlp_ratio=mlp_ratio,
-                    qkv_bias=qkv_bias,
-                )
                 for _ in range(depth)
             ]
         )
 
-        # Final layer norm
         self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
 
     def interpolate_pos_encoding(self, x: torch.Tensor, w: int, h: int) -> torch.Tensor:
-        """Interpolate positional encoding for different input sizes.
-
-        This matches the Facebook DINOv2 implementation exactly, including
-        the interpolation offset kludge for backward compatibility.
-        """
         previous_dtype = x.dtype
-        npatch = x.shape[1] - 1
-        N = self.pos_embed.shape[1] - 1
 
-        # If input matches training resolution, return as-is
         if npatch == N and w == h:
             return self.pos_embed

@@ -471,10 +332,8 @@ class VisionTransformerBackbone(nn.Module):
         dim = x.shape[-1]
         w0 = w // self.patch_size
         h0 = h // self.patch_size
-        M = int(math.sqrt(N))
 
-        # Use scale_factor with offset for backward compatibility
-        # This is the "kludge" from Facebook's DINO implementation
         sx = float(w0 + self.interpolate_offset) / M
         sy = float(h0 + self.interpolate_offset) / M

@@ -490,22 +349,6 @@ class VisionTransformerBackbone(nn.Module):
 
         return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
 
-    def prepare_tokens(self, x: torch.Tensor) -> torch.Tensor:
-        """Prepare input tokens with positional encoding."""
-        B, C, W, H = x.shape
-
-        # Patch embedding
-        x = self.patch_embed(x)
-
-        # Add CLS token
-        cls_tokens = self.cls_token.expand(B, -1, -1)
-        x = torch.cat((cls_tokens, x), dim=1)
-
-        # Add positional encoding
-        x = x + self.interpolate_pos_encoding(x, W, H)
-
-        return x
-
     def forward(self, images: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Extract features from images.

@@ -513,80 +356,52 @@ class VisionTransformerBackbone(nn.Module):
             images: Input images [B, 3, H, W] where H, W are multiples of 14
 
         Returns:
-            Tuple of
-            - patch_features: [B, 768, H//14, W//14] spatial feature map
-            - cls_token: [B, 768] global CLS token
         """
-        batch_size, _, height, width = images.shape
 
-        x = self.prepare_tokens(images)
-
-        # Apply transformer blocks
         for block in self.blocks:
             x = block(x)
 
-        # Apply final layer norm
         x = self.norm(x)
 
-        # Extract CLS token and patch tokens
         cls_token = x[:, 0]
         patch_tokens = x[:, 1:]
-
-        h_patches = height // self.patch_size
-        w_patches = width // self.patch_size
-        patch_features = patch_tokens.reshape(batch_size, h_patches, w_patches, self.embed_dim).permute(0, 3, 1, 2)
 
         return patch_features, cls_token
 
 
 # ==============================================================================
-#
 # ==============================================================================
 
 
-class 
-
-
-    Applies the optimal transport aggregation followed by a linear layer
-    to reduce dimensionality to the desired output size.
-
-    Args:
-        output_dim: Final descriptor dimensionality
-        aggregator_config: Configuration for FeatureAggregationHead
-        aggregator_output_dim: Output dimension of the aggregation head
-    """
-
-    def __init__(self, output_dim: int, aggregator_config: dict, aggregator_output_dim: int):
         super().__init__()
-        self.
-        self.projection = nn.Linear(aggregator_output_dim, output_dim)
 
     def forward(self, x):
-        aggregated = self.
-        return self.projection(aggregated)
-
-
-# ==============================================================================
-# L2 Normalization Layer
-# ==============================================================================
 
 
-class 
-
-
-    def __init__(self, dim: int = -1):
         super().__init__()
-        self.dim = dim
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return F.normalize(x, p=2, dim=self.dim)
-
 
 
 class MegaLoc(nn.Module, PyTorchModelHubMixin):

@@ -604,10 +419,9 @@ class MegaLoc(nn.Module, PyTorchModelHubMixin):
         mlp_dim: Hidden dimension for MLPs (default: 512)
 
     Example:
-        >>> model = 
         >>> model.eval()
-        >>> 
-        >>> descriptor = model(image)  # [1, 8448]
     """
 
     def __init__(

@@ -620,25 +434,21 @@ class MegaLoc(nn.Module, PyTorchModelHubMixin):
     ):
         super().__init__()
 
-        self.backbone = VisionTransformerBackbone()
-
-        self.aggregator = 
-
-            output_dim=feat_dim,
-            aggregator_config={
-                "input_channels": self.backbone.num_channels,
                 "num_clusters": num_clusters,
-                "
-                "
-                "
             },
-
         )
-
         self.feat_dim = feat_dim
-        self.normalize = 
 
     def forward(self, images: torch.Tensor) -> torch.Tensor:
         """Extract global descriptor from images.

@@ -649,19 +459,11 @@ class MegaLoc(nn.Module, PyTorchModelHubMixin):
         Returns:
             L2-normalized descriptors [B, feat_dim]
         """
-
-
-        # Extract backbone features
-        features = self.backbone(images)
-
-        # Aggregate into global descriptor
-        descriptor = self.aggregator(features)
-
-        # Final L2 normalization
-        return self.normalize(descriptor)
@@ -18,227 +18,143 @@ import torchvision.transforms.functional as tfm
 from huggingface_hub import PyTorchModelHubMixin
 
 
+# Code adapted from OpenGlue, MIT license
+# https://github.com/ucuapps/OpenGlue/blob/main/models/superglue/optimal_transport.py
+def log_otp_solver(log_a, log_b, M, num_iters: int = 20, reg: float = 1.0) -> torch.Tensor:
+    r"""Sinkhorn matrix scaling algorithm for Differentiable Optimal Transport problem.
+
+    This function solves the optimization problem and returns the OT matrix for the given parameters.
+
+    Args:
+        log_a : torch.Tensor
+            Source weights
+        log_b : torch.Tensor
+            Target weights
+        M : torch.Tensor
+            metric cost matrix
+        num_iters : int, default=20
+            The number of iterations.
+        reg : float, default=1.0
+            regularization value
+    """
+    M = M / reg  # regularization
 
+    u, v = torch.zeros_like(log_a), torch.zeros_like(log_b)
 
+    for _ in range(num_iters):
+        u = log_a - torch.logsumexp(M + v.unsqueeze(1), dim=2).squeeze()
+        v = log_b - torch.logsumexp(M + u.unsqueeze(2), dim=1).squeeze()
 
+    return M + u.unsqueeze(2) + v.unsqueeze(1)
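A quick way to sanity-check the solver: after enough iterations, exponentiating the returned log-transport matrix should give a plan whose row sums approximate exp(log_a) and whose column sums approximate exp(log_b). A minimal sketch, assuming only torch, math, and the log_otp_solver defined above (all values arbitrary):

import math
import torch

batch, m, n = 1, 4, 6
# Uniform source/target weights in log space, each summing to 1
log_a = torch.full((batch, m), -math.log(m))
log_b = torch.full((batch, n), -math.log(n))
scores = torch.randn(batch, m, n)  # higher score receives more transport mass

log_P = log_otp_solver(log_a, log_b, scores, num_iters=100, reg=1.0)
P = log_P.exp()
print(P.sum(dim=2))  # rows: approximately exp(log_a), i.e. 1/m each
print(P.sum(dim=1))  # columns: approximately exp(log_b), i.e. 1/n each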
 
 
+# Code adapted from OpenGlue, MIT license
+# https://github.com/ucuapps/OpenGlue/blob/main/models/superglue/superglue.py
+def get_matching_probs(S, dustbin_score=1.0, num_iters=3, reg=1.0):
+    """Sinkhorn with a dustbin: augment the score matrix with a slack row and return log assignment probabilities."""
+    batch_size, m, n = S.size()
+    # augment scores matrix
+    S_aug = torch.empty(batch_size, m + 1, n, dtype=S.dtype, device=S.device)
+    S_aug[:, :m, :n] = S
+    S_aug[:, m, :] = dustbin_score
 
+    # prepare normalized source and target log-weights
+    norm = -torch.tensor(math.log(n + m), device=S.device)
+    log_a, log_b = norm.expand(m + 1).contiguous(), norm.expand(n).contiguous()
+    log_a[-1] = log_a[-1] + math.log(n - m)
+    log_a, log_b = log_a.expand(batch_size, -1), log_b.expand(batch_size, -1)
+    log_P = log_otp_solver(log_a, log_b, S_aug, num_iters=num_iters, reg=reg)
+    return log_P - norm
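A usage sketch under the shapes MegaLoc actually feeds it (64 clusters, 37x37 = 1369 patches): the output has one extra "dustbin" row for unassigned patches, and because of the final renormalization each patch's assignment mass over clusters plus dustbin sums to roughly 1.

import torch

S = torch.randn(2, 64, 1369)  # [B, num_clusters, num_patches]
log_P = get_matching_probs(S, dustbin_score=1.0, num_iters=3, reg=1.0)
P = log_P.exp()
print(P.shape)       # torch.Size([2, 65, 1369]): 64 clusters + 1 dustbin row
print(P.sum(dim=1))  # per-patch assignment mass, approximately 1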
 
 
+class FeatureAggregator(nn.Module):
+    """Optimal transport-based aggregation of local features into global descriptor.
+
+    This module aggregates local patch features into a compact global representation
+    using differentiable optimal transport.
+
+    Args:
+        num_channels: Number of input feature channels (from backbone)
+        num_clusters: Number of cluster centers
+        cluster_dim: Dimensionality of cluster descriptors
+        token_dim: Dimensionality of global scene token
+        mlp_dim: Hidden dimension for MLPs
+        dropout: Dropout probability (0 to disable)
+    """
+
+    def __init__(
+        self,
+        num_channels=1536,
+        num_clusters=64,
+        cluster_dim=128,
+        token_dim=256,
+        mlp_dim=512,
+        dropout=0.3,
+    ) -> None:
+        super().__init__()
+
+        self.num_channels = num_channels
+        self.num_clusters = num_clusters
+        self.cluster_dim = cluster_dim
+        self.token_dim = token_dim
+        self.mlp_dim = mlp_dim
+
+        if dropout > 0:
+            dropout = nn.Dropout(dropout)
+        else:
+            dropout = nn.Identity()
+
+        # MLP for global scene token
+        self.token_features = nn.Sequential(
+            nn.Linear(self.num_channels, self.mlp_dim), nn.ReLU(), nn.Linear(self.mlp_dim, self.token_dim)
+        )
+        # MLP for local features
+        self.cluster_features = nn.Sequential(
+            nn.Conv2d(self.num_channels, self.mlp_dim, 1),
+            dropout,
+            nn.ReLU(),
+            nn.Conv2d(self.mlp_dim, self.cluster_dim, 1),
+        )
+        # MLP for score matrix
+        self.score = nn.Sequential(
+            nn.Conv2d(self.num_channels, self.mlp_dim, 1),
+            dropout,
+            nn.ReLU(),
+            nn.Conv2d(self.mlp_dim, self.num_clusters, 1),
+        )
+        # Dustbin parameter
+        self.dust_bin = nn.Parameter(torch.tensor(1.0))
+
+    def forward(self, x):
+        """
+        Args:
+            x: Tuple of (features, token)
+                features: [B, C, H, W] spatial feature map
+                token: [B, C] global CLS token
+
+        Returns:
+            Global descriptor [B, num_clusters * cluster_dim + token_dim]
+        """
+        x, t = x
 
+        f = self.cluster_features(x).flatten(2)  # [B, cluster_dim, H*W]
+        p = self.score(x).flatten(2)             # [B, num_clusters, H*W]
+        t = self.token_features(t)               # [B, token_dim]
 
+        # Optimal transport assignment of patches to clusters; drop the dustbin row
+        p = get_matching_probs(p, self.dust_bin, 3)
+        p = torch.exp(p)
+        p = p[:, :-1, :]
 
+        p = p.unsqueeze(1).repeat(1, self.cluster_dim, 1, 1)
+        f = f.unsqueeze(2).repeat(1, 1, self.num_clusters, 1)
 
+        f = torch.cat(
+            [
+                F.normalize(t, p=2, dim=-1),
+                F.normalize((f * p).sum(dim=-1), p=2, dim=1).flatten(1),
+            ],
+            dim=-1,
+        )
 
+        return F.normalize(f, p=2, dim=-1)
 
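The aggregation is easiest to follow by shapes. A minimal sketch with random inputs, setting num_channels to the 768 channels the DINOv2 backbone below produces (MegaLoc passes self.backbone.num_channels for exactly this reason; the 1536 default is not used here):

import torch

agg = FeatureAggregator(num_channels=768, num_clusters=64, cluster_dim=128, token_dim=256)
features = torch.randn(2, 768, 37, 37)  # patch features of a 518x518 image
token = torch.randn(2, 768)             # CLS token
descriptor = agg((features, token))
print(descriptor.shape)  # torch.Size([2, 8448]) = 64 * 128 + 256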
 # ==============================================================================

@@ -249,13 +165,7 @@ class FeatureAggregationHead(nn.Module):
 class PatchEmbedding(nn.Module):
     """Convert image patches to embeddings using a convolutional layer."""
 
+    def __init__(self, image_size: int = 518, patch_size: int = 14, in_channels: int = 3, embed_dim: int = 768):
         super().__init__()
         self.image_size = image_size
         self.patch_size = patch_size

@@ -263,11 +173,8 @@ class PatchEmbedding(nn.Module):
         self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
         x = x.flatten(2)
         x = x.transpose(1, 2)
         return x
 

@@ -287,12 +194,7 @@ class MultiHeadAttention(nn.Module):
     """Multi-head self-attention module."""
 
     def __init__(
+        self, dim: int, num_heads: int = 12, qkv_bias: bool = True, attn_drop: float = 0.0, proj_drop: float = 0.0
     ):
         super().__init__()
         self.num_heads = num_heads

@@ -307,17 +209,14 @@ class MultiHeadAttention(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
 
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
+        qkv = qkv.permute(2, 0, 3, 1, 4)
         q, k, v = qkv[0], qkv[1], qkv[2]
 
         attn = (q @ k.transpose(-2, -1)) * self.scale
         attn = attn.softmax(dim=-1)
         attn = self.attn_drop(attn)
 
         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
         x = self.proj(x)
         x = self.proj_drop(x)

@@ -328,13 +227,7 @@ class MultiHeadAttention(nn.Module):
 class MLP(nn.Module):
     """MLP module with GELU activation."""
 
+    def __init__(self, in_features: int, hidden_features: int = None, out_features: int = None, drop: float = 0.0):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features

@@ -368,21 +261,11 @@ class TransformerBlock(nn.Module):
     ):
         super().__init__()
         self.norm1 = nn.LayerNorm(dim, eps=1e-6)
+        self.attn = MultiHeadAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
         self.ls1 = LayerScale(dim, init_value=init_values)
 
         self.norm2 = nn.LayerNorm(dim, eps=1e-6)
+        self.mlp = MLP(in_features=dim, hidden_features=int(dim * mlp_ratio), drop=drop)
         self.ls2 = LayerScale(dim, init_value=init_values)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:

@@ -391,12 +274,10 @@ class TransformerBlock(nn.Module):
         return x
 
 
+class DINOv2(nn.Module):
     """DINOv2 Vision Transformer backbone for feature extraction.
 
     This implements a ViT-B/14 architecture compatible with DINOv2 weights.
     """
 
     def __init__(

@@ -413,54 +294,34 @@ class VisionTransformerBackbone(nn.Module):
         super().__init__()
         self.patch_size = patch_size
         self.embed_dim = embed_dim
+        self.num_channels = embed_dim
 
         self.patch_embed = PatchEmbedding(
+            image_size=image_size, patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim
         )
 
         self.interpolate_offset = 0.1
         self.interpolate_antialias = False
 
         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
         num_patches = (image_size // patch_size) ** 2
         self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
 
         self.blocks = nn.ModuleList(
             [
+                TransformerBlock(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias)
                 for _ in range(depth)
             ]
         )
 
         self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
 
     def interpolate_pos_encoding(self, x: torch.Tensor, w: int, h: int) -> torch.Tensor:
+        """Interpolate positional encoding for different input sizes."""
         previous_dtype = x.dtype
+        npatch = x.shape[1] - 1
+        N = self.pos_embed.shape[1] - 1
 
         if npatch == N and w == h:
             return self.pos_embed

@@ -471,10 +332,8 @@ class VisionTransformerBackbone(nn.Module):
         dim = x.shape[-1]
         w0 = w // self.patch_size
         h0 = h // self.patch_size
+        M = int(math.sqrt(N))
 
         sx = float(w0 + self.interpolate_offset) / M
         sy = float(h0 + self.interpolate_offset) / M

@@ -490,22 +349,6 @@ class VisionTransformerBackbone(nn.Module):
 
         return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
 

@@ -513,80 +356,52 @@ class VisionTransformerBackbone(nn.Module):
     def forward(self, images: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Extract features from images.
 
         Args:
             images: Input images [B, 3, H, W] where H, W are multiples of 14
 
         Returns:
+            Tuple of (patch_features [B, 768, H//14, W//14], cls_token [B, 768])
         """
+        B, _, H, W = images.shape
 
+        x = self.patch_embed(images)
+        cls_tokens = self.cls_token.expand(B, -1, -1)
+        x = torch.cat((cls_tokens, x), dim=1)
+        x = x + self.interpolate_pos_encoding(x, W, H)
 
         for block in self.blocks:
             x = block(x)
 
         x = self.norm(x)
 
         cls_token = x[:, 0]
         patch_tokens = x[:, 1:]
+        patch_features = patch_tokens.reshape(B, H // self.patch_size, W // self.patch_size, self.embed_dim).permute(
+            0, 3, 1, 2
+        )
 
         return patch_features, cls_token
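The backbone returns exactly the (features, token) tuple that FeatureAggregator.forward expects. A shape-check sketch with randomly initialized weights (not the released checkpoint), using an input whose sides are already multiples of 14:

import torch

backbone = DINOv2()
images = torch.randn(1, 3, 224, 224)  # 224 = 16 * 14, so no resizing is needed
patch_features, cls_token = backbone(images)
print(patch_features.shape)  # torch.Size([1, 768, 16, 16])
print(cls_token.shape)       # torch.Size([1, 768])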
 
 
 # ==============================================================================
+# Main Model
 # ==============================================================================
 
 
+class L2Norm(nn.Module):
+    def __init__(self, dim=1):
+        super().__init__()
+        self.dim = dim
 
+    def forward(self, x):
+        return F.normalize(x, p=2.0, dim=self.dim)
 
 
+class Aggregator(nn.Module):
+    def __init__(self, feat_dim, agg_config, salad_out_dim):
+        super().__init__()
+        self.agg = FeatureAggregator(**agg_config)
+        self.linear = nn.Linear(salad_out_dim, feat_dim)
 
+    def forward(self, x):
+        x = self.agg(x)
+        return self.linear(x)
 
 
 class MegaLoc(nn.Module, PyTorchModelHubMixin):

@@ -604,10 +419,9 @@ class MegaLoc(nn.Module, PyTorchModelHubMixin):
         mlp_dim: Hidden dimension for MLPs (default: 512)
 
     Example:
+        >>> model = torch.hub.load("gmberton/MegaLoc", "get_trained_model")
         >>> model.eval()
+        >>> descriptor = model(image)  # [B, 8448]
     """
 
     def __init__(

@@ -620,25 +434,21 @@ class MegaLoc(nn.Module, PyTorchModelHubMixin):
     ):
         super().__init__()
 
+        self.backbone = DINOv2()
+        self.salad_out_dim = num_clusters * cluster_dim + token_dim
+        self.aggregator = Aggregator(
+            feat_dim=feat_dim,
+            agg_config={
+                "num_channels": self.backbone.num_channels,
                 "num_clusters": num_clusters,
+                "cluster_dim": cluster_dim,
+                "token_dim": token_dim,
+                "mlp_dim": mlp_dim,
             },
+            salad_out_dim=self.salad_out_dim,
         )
         self.feat_dim = feat_dim
+        self.l2norm = L2Norm()
 
     def forward(self, images: torch.Tensor) -> torch.Tensor:
         """Extract global descriptor from images.

@@ -649,19 +459,11 @@ class MegaLoc(nn.Module, PyTorchModelHubMixin):
         Returns:
             L2-normalized descriptors [B, feat_dim]
         """
+        b, c, h, w = images.shape
+        # Image sides must be multiples of 14 (the DINOv2 patch size); resize if they are not
+        if h % 14 != 0 or w % 14 != 0:
+            h = round(h / 14) * 14
+            w = round(w / 14) * 14
+            images = tfm.resize(images, [h, w], antialias=True)
+        features = self.aggregator(self.backbone(images))
+        features = self.l2norm(features)
+        return features
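Putting the pieces together, inference after this commit looks like the sketch below. It assumes the checkpoint loads through PyTorchModelHubMixin.from_pretrained, and "gmberton/MegaLoc" is an assumed repository id (the docstring's torch.hub route is the documented alternative):

import torch

# from_pretrained comes from PyTorchModelHubMixin and, after this commit,
# reads weights from model.safetensors; the repo id here is an assumption.
model = MegaLoc.from_pretrained("gmberton/MegaLoc")
model.eval()

with torch.inference_mode():
    image = torch.rand(1, 3, 512, 512)  # 512 is not a multiple of 14: forward resizes to 518x518
    descriptor = model(image)           # [1, 8448], L2-normalized
    print(descriptor.norm(dim=1))       # approximately 1.0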
model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:d4f9f2bcb60018f91eb6a8e061ed054fd55654e10c2569cf13841ea986ffb4f8
+size 914577436
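Since the point of the commit is safetensors-based weight loading, here is a hedged sketch of loading this file directly with safetensors.torch.load_file (the standard API for .safetensors checkpoints); the local path assumes a clone of this repository:

from safetensors.torch import load_file

# Path assumed: a local clone of the model repo containing model.safetensors
state_dict = load_file("model.safetensors")

model = MegaLoc()
model.load_state_dict(state_dict)
model.eval()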