Pawel Piwowarski committed on
Commit
8a6022e
·
0 Parent(s):

init commit

Browse files
Files changed (8) hide show
  1. .gitattributes +2 -0
  2. .gitignore +1 -0
  3. README.md +3 -0
  4. load_model.py +36 -0
  5. model.py +161 -0
  6. models/mixvpr.py +94 -0
  7. models/salad.py +141 -0
  8. weights/best_model_95.6.torch +3 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .torch filter=lfs diff=lfs merge=lfs -text
2
+ *.torch filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
load_model.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from model import DINOv2FeatureExtractor
import torch


# Select GPU when available; checkpoint tensors are mapped onto this device on load.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

MODEL_CHECKPOINT_PATH = './weights/best_model_95.6.torch'


# Build the feature extractor: frozen DINOv2 ViT-B backbone
# (num_of_layers_to_unfreeze=0) with a SALAD aggregation head.
model = DINOv2FeatureExtractor(
    model_type="vit_base_patch14_reg4_dinov2.lvd142m",
    num_of_layers_to_unfreeze=0,
    desc_dim=768,
    aggregator_type="SALAD",
)
print('loading model ... ')
# NOTE(review): torch.load without weights_only=True unpickles arbitrary objects;
# for a plain state_dict checkpoint, weights_only=True is safer — confirm and enable.
model_state_dict = torch.load(MODEL_CHECKPOINT_PATH, map_location=DEVICE)
model.load_state_dict(model_state_dict)
# Move to the target device once and switch to inference mode.
# (The original script called model.to(DEVICE) a second time further down — redundant.)
model = model.to(DEVICE)
model.eval()
print('loaded ....')


# Print some info about model weights
num_params = sum(p.numel() for p in model.parameters())
num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model total parameters: {num_params:,}")
print(f"Model trainable parameters: {num_trainable:,}")

print(model.aggregator_type)
model.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import timm
5
+ import logging
6
+ from types import SimpleNamespace as Namespace
7
+
8
+ # Assuming these are in your project structure
9
+ from models.salad import SALAD
10
+ from models.mixvpr import MixVPR
11
+
12
+
13
+
14
+
15
class DINOv2FeatureExtractor(nn.Module):
    """DINOv2 (timm ViT) backbone with an optional VPR aggregation head.

    Depending on ``aggregator_type`` the forward pass returns:
      * ``"SALAD"``  — a SALAD-aggregated global descriptor,
      * ``"MixVPR"`` — a MixVPR-aggregated global descriptor,
      * anything else (default ``"No"``) — the L2-normalized pooled feature
        from the backbone head.
    """

    def __init__(
        self,
        image_size=518,  # Default input resolution for DINOv2 models
        model_type="vit_base_patch14_reg4_dinov2.lvd142m",
        num_of_layers_to_unfreeze=1,
        desc_dim=768,  # vit-base has 768-dim embeddings
        aggregator_type="No",
    ):
        """
        Args:
            image_size: Square input resolution forwarded to timm.
            model_type: timm model name for the ViT backbone.
            num_of_layers_to_unfreeze: How many final transformer blocks stay trainable.
            desc_dim: Descriptor size; overwritten below when an aggregator is chosen.
            aggregator_type: "SALAD", "MixVPR", or anything else for plain pooled features.
        """
        super().__init__()

        # Initialize backbone with registers (num_classes=0 drops the classifier head)
        self.backbone = timm.create_model(
            model_type, pretrained=True, num_classes=0, img_size=image_size
        )

        # Store configuration parameters
        self.model_type = model_type
        self.num_channels = self.backbone.embed_dim
        self.desc_dim = desc_dim
        self.image_size = image_size
        self.num_of_layers_to_unfreeze = num_of_layers_to_unfreeze
        self.aggregator_type = aggregator_type
        self.aggregator = None

        # Aggregator hyper-parameters are keyed off the ViT size in model_type.
        # NOTE(review): if model_type matches none of the vit_small/base/large
        # patterns, self.aggregator stays None while aggregator_type is "SALAD",
        # and forward() would fail when calling it — confirm intended.
        if aggregator_type == "SALAD":
            if "vit_small" in model_type:
                self.aggregator = SALAD(
                    num_channels=self.num_channels,
                    num_clusters=24,
                    cluster_dim=64,
                    token_dim=512,
                    dropout=0.3,
                )
                # Output: 512 + (24 * 64) = 2,048 dims
                self.desc_dim = 512 + (24 * 64)
            elif "vit_base" in model_type:
                self.aggregator = SALAD(
                    num_channels=self.num_channels,
                    num_clusters=32,
                    cluster_dim=64,
                    token_dim=1024,
                    dropout=0.3,
                )
                # Output: 1024 + (32 * 64) = 3,072 dims
                self.desc_dim = 1024 + (32 * 64)
            elif "vit_large" in model_type:
                self.aggregator = SALAD(
                    num_channels=self.num_channels,
                    num_clusters=48,
                    cluster_dim=64,
                    token_dim=1024,
                    dropout=0.3,
                )
                # Output: 1024 + (48 * 64) = 4,096 dims
                self.desc_dim = 1024 + (48 * 64)
        elif aggregator_type == "MixVPR":
            # ViT patch size is hard-coded to 14 (DINOv2 patch14 models).
            patch_dim = image_size // 14
            if "vit_small" in model_type:
                out_dim = 2048
            elif "vit_base" in model_type:
                out_dim = 3072
            elif "vit_large" in model_type:
                out_dim = 4096
            else:
                # Default or error
                out_dim = 4096

            self.aggregator = MixVPR(
                in_channels=self.num_channels,
                in_h=patch_dim,
                in_w=patch_dim,
                out_channels=out_dim,
            )
            self.desc_dim = out_dim


        # This should be called regardless of the aggregator type.
        self._freeze_parameters()

    def _freeze_parameters(self):
        """
        Freeze all parameters except the last N transformer blocks and norm layer.
        """
        # First freeze everything
        for param in self.backbone.parameters():
            param.requires_grad = False

        # Unfreeze the last N blocks
        if self.num_of_layers_to_unfreeze > 0:
            for block in self.backbone.blocks[
                -self.num_of_layers_to_unfreeze :
            ]:
                for param in block.parameters():
                    param.requires_grad = True

        # Unfreeze norm layer
        # NOTE(review): the final norm stays trainable even when
        # num_of_layers_to_unfreeze == 0 — confirm this is intended.
        for param in self.backbone.norm.parameters():
            param.requires_grad = True

        # Count trainable parameters for backbone
        def count_trainable_params(model):
            return sum(p.numel() for p in model.parameters() if p.requires_grad)

        logging.info(
            f"Number of trainable parameters backbone: {count_trainable_params(self.backbone):,}"
        )

        # Count aggregator parameters if it exists
        if self.aggregator is not None:
            aggregator_params = count_trainable_params(self.aggregator)
            logging.info(
                f"Number of trainable parameters aggregator: {aggregator_params:,}"
            )
            logging.info(
                f"Total trainable parameters: {count_trainable_params(self.backbone) + aggregator_params:,}"
            )

    def forward(self, x):
        """Extract a global descriptor from an image batch.

        Args:
            x: Image tensor of shape (B, C, H, W); H and W are assumed to be
               multiples of the 14-pixel patch size (see the // 14 reshape below).

        Returns:
            Aggregated descriptor (SALAD/MixVPR) or the L2-normalized pooled
            backbone feature.
        """
        B, _, H, W = x.shape
        x = self.backbone.forward_features(x)

        # Consistent handling for register vs. non-register models
        if self.aggregator_type in ["SALAD", "MixVPR"]:
            # DINOv2 with registers has 4 register tokens + 1 CLS token
            # Standard ViT has 1 CLS token
            start_index = 5 if "reg" in self.model_type else 1
            patch_tokens = x[:, start_index:]

            # Reshape to (B, C, H, W) for aggregators
            patch_tokens_map = patch_tokens.reshape(
                (B, H // 14, W // 14, self.num_channels)
            ).permute(0, 3, 1, 2)

            if self.aggregator_type == "SALAD":
                # SALAD consumes both the patch map and the CLS token.
                cls_token = x[:, 0]
                return self.aggregator((patch_tokens_map, cls_token))
            elif self.aggregator_type == "MixVPR":
                return self.aggregator(patch_tokens_map)

        # Default behavior: extract features from CLS pooling
        features = self.backbone.forward_head(x, pre_logits=True)

        # L2 normalization
        return F.normalize(features, p=2, dim=-1)
+
models/mixvpr.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+
5
+ import numpy as np
6
+
7
+
8
class FeatureMixerLayer(nn.Module):
    """Residual feature mixer: x + MLP(LayerNorm(x)) over the last dimension."""

    def __init__(self, in_dim, mlp_ratio=1):
        super().__init__()
        hidden = int(in_dim * mlp_ratio)
        self.mix = nn.Sequential(
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, in_dim),
        )

        # Truncated-normal init for the linear weights, zeroed biases.
        for module in self.mix:
            if isinstance(module, nn.Linear):
                nn.init.trunc_normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x):
        """Apply the mixer MLP with a skip connection."""
        return x + self.mix(x)
26
+
27
+
28
class MixVPR(nn.Module):
    """Aggregate a (B, C, H, W) feature map into a compact global descriptor.

    The map is flattened to (B, C, H*W), passed through ``mix_depth`` stacked
    FeatureMixerLayer blocks over the spatial axis, then projected channel-wise
    (C -> out_channels) and row-wise (H*W -> out_rows). The flattened result is
    L2-normalized, giving a descriptor of size out_channels * out_rows.
    """

    def __init__(self,
                 in_channels=1024,
                 in_h=20,
                 in_w=20,
                 out_channels=512,
                 mix_depth=1,
                 mlp_ratio=1,
                 out_rows=4,
                 ) -> None:
        super().__init__()

        # Input feature-map geometry.
        self.in_h = in_h
        self.in_w = in_w
        self.in_channels = in_channels

        # Output projection sizes.
        self.out_channels = out_channels  # channel-wise projection dimension
        self.out_rows = out_rows          # row-wise projection dimension

        self.mix_depth = mix_depth        # number of stacked FeatureMixers
        self.mlp_ratio = mlp_ratio        # hidden ratio inside each mixer MLP

        num_tokens = in_h * in_w
        mixers = [FeatureMixerLayer(in_dim=num_tokens, mlp_ratio=mlp_ratio)
                  for _ in range(self.mix_depth)]
        self.mix = nn.Sequential(*mixers)
        self.channel_proj = nn.Linear(in_channels, out_channels)
        self.row_proj = nn.Linear(num_tokens, out_rows)

    def forward(self, x):
        """Map (B, C, H, W) features to an L2-normalized (B, out_channels*out_rows) vector."""
        tokens = self.mix(x.flatten(2))                          # (B, C, H*W)
        projected = self.channel_proj(tokens.permute(0, 2, 1))   # (B, H*W, out_channels)
        rows = self.row_proj(projected.permute(0, 2, 1))         # (B, out_channels, out_rows)
        return F.normalize(rows.flatten(1), p=2, dim=1)
67
+
68
+
69
+ # -------------------------------------------------------------------------------
70
+
71
def print_nb_params(m):
    """Print the number of trainable parameters of module *m*, in millions."""
    trainable = [p for p in m.parameters() if p.requires_grad]
    total = sum(np.prod(p.size()) for p in trainable)
    print(f'Trainable parameters: {total/1e6:.3}M')
75
+
76
+
77
def main():
    """Smoke test: run MixVPR on a random feature map and report its size."""
    features = torch.randn(1, 1024, 20, 20)
    aggregator = MixVPR(
        in_channels=1024,
        in_h=20,
        in_w=20,
        out_channels=1024,
        mix_depth=4,
        mlp_ratio=1,
        out_rows=4)

    print_nb_params(aggregator)
    descriptor = aggregator(features)
    print(descriptor.shape)


if __name__ == '__main__':
    main()
models/salad.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ # Code adapted from OpenGlue, MIT license
6
+ # https://github.com/ucuapps/OpenGlue/blob/main/models/superglue/optimal_transport.py
7
def log_otp_solver(log_a, log_b, M, num_iters: int = 20, reg: float = 1.0) -> torch.Tensor:
    r"""Sinkhorn matrix scaling in log space for entropic optimal transport.

    Code adapted from OpenGlue, MIT license:
    https://github.com/ucuapps/OpenGlue/blob/main/models/superglue/optimal_transport.py

    Args:
        log_a: Log source marginals, shape (B, m).
        log_b: Log target marginals, shape (B, n).
        M: Metric cost/score matrix, shape (B, m, n).
        num_iters: Number of Sinkhorn iterations (default 20).
        reg: Entropic regularization strength.

    Returns:
        Log of the (approximate) optimal transport plan, shape (B, m, n).
    """
    cost = M / reg  # apply entropic regularization

    u = torch.zeros_like(log_a)
    v = torch.zeros_like(log_b)

    # Alternate row/column scaling updates, all in log space for stability.
    for _ in range(num_iters):
        u = log_a - torch.logsumexp(cost + v.unsqueeze(1), dim=2).squeeze()
        v = log_b - torch.logsumexp(cost + u.unsqueeze(2), dim=1).squeeze()

    return cost + u.unsqueeze(2) + v.unsqueeze(1)
31
+
32
+ # Code adapted from OpenGlue, MIT license
33
+ # https://github.com/ucuapps/OpenGlue/blob/main/models/superglue/superglue.py
34
def get_matching_probs(S, dustbin_score=1.0, num_iters=3, reg=1.0):
    """Run Sinkhorn on the score matrix S augmented with a dustbin row.

    Code adapted from OpenGlue, MIT license:
    https://github.com/ucuapps/OpenGlue/blob/main/models/superglue/superglue.py

    Args:
        S: Score matrix, shape (B, m, n). Assumes n > m so the dustbin row
           can absorb the n - m surplus mass (log(n - m) below).
        dustbin_score: Constant score assigned to the appended dustbin row.
        num_iters: Sinkhorn iterations forwarded to the solver.
        reg: Entropic regularization forwarded to the solver.

    Returns:
        Log transport plan of shape (B, m + 1, n), shifted by the log
        normalization constant.
    """
    batch_size, m, n = S.size()

    # Augment the score matrix with one dustbin row of constant score.
    S_aug = torch.empty(batch_size, m + 1, n, dtype=S.dtype, device=S.device)
    S_aug[:, :m, :n] = S
    S_aug[:, m, :] = dustbin_score

    # Uniform log marginals; the dustbin entry carries the extra n - m mass.
    norm = -torch.tensor(math.log(n + m), device=S.device)
    log_a = norm.expand(m + 1).contiguous()
    log_b = norm.expand(n).contiguous()
    log_a[-1] = log_a[-1] + math.log(n - m)
    log_a = log_a.expand(batch_size, -1)
    log_b = log_b.expand(batch_size, -1)

    log_P = log_otp_solver(
        log_a,
        log_b,
        S_aug,
        num_iters=num_iters,
        reg=reg
    )
    return log_P - norm
55
+
56
+
57
class SALAD(nn.Module):
    """
    Sinkhorn Algorithm for Locally Aggregated Descriptors (SALAD).

    Softly assigns local patch features to learned clusters via optimal
    transport (Sinkhorn with a dustbin), aggregates them per cluster, and
    concatenates a projected global scene token.

    Attributes:
        num_channels (int): The number of channels of the inputs (d).
        num_clusters (int): The number of clusters in the model (m).
        cluster_dim (int): The number of channels of the clusters (l).
        token_dim (int): The dimension of the global scene token (g).
        dropout (float): The dropout rate.
    """
    def __init__(self,
            num_channels=1536,
            num_clusters=64,
            cluster_dim=128,
            token_dim=256,
            dropout=0.3,
        ) -> None:
        super().__init__()

        self.num_channels = num_channels
        self.num_clusters= num_clusters
        self.cluster_dim = cluster_dim
        self.token_dim = token_dim

        # A single dropout module (or identity) is shared by the two conv
        # MLPs below.
        if dropout > 0:
            dropout = nn.Dropout(dropout)
        else:
            dropout = nn.Identity()

        # MLP for global scene token g
        self.token_features = nn.Sequential(
            nn.Linear(self.num_channels, 512),
            nn.ReLU(),
            nn.Linear(512, self.token_dim)
        )
        # MLP for local features f_i (1x1 convs act per-patch)
        self.cluster_features = nn.Sequential(
            nn.Conv2d(self.num_channels, 512, 1),
            dropout,
            nn.ReLU(),
            nn.Conv2d(512, self.cluster_dim, 1)
        )
        # MLP for score matrix S (patch-to-cluster affinities)
        self.score = nn.Sequential(
            nn.Conv2d(self.num_channels, 512, 1),
            dropout,
            nn.ReLU(),
            nn.Conv2d(512, self.num_clusters, 1),
        )
        # Dustbin parameter z (learnable scalar score for unassigned patches)
        self.dust_bin = nn.Parameter(torch.tensor(1.))


    def forward(self, x):
        """
        x (tuple): A tuple containing two elements, f and t.
            (torch.Tensor): The feature tensors (t_i) [B, C, H // 14, W // 14].
            (torch.Tensor): The token tensor (t_{n+1}) [B, C].

        Returns:
            f (torch.Tensor): The global descriptor [B, m*l + g]
        """
        x, t = x  # Extract features and token

        # f: (B, cluster_dim, HW); p: (B, num_clusters, HW); t: (B, token_dim)
        f = self.cluster_features(x).flatten(2)
        p = self.score(x).flatten(2)
        t = self.token_features(t)

        # Sinkhorn algorithm: soft patch-to-cluster assignment with dustbin
        p = get_matching_probs(p, self.dust_bin, 3)
        p = torch.exp(p)
        # discard the dustbin
        p = p[:, :-1, :]


        # Broadcast assignments and features to (B, cluster_dim, num_clusters, HW)
        p = p.unsqueeze(1).repeat(1, self.cluster_dim, 1, 1)
        f = f.unsqueeze(2).repeat(1, 1, self.num_clusters, 1)

        # Concatenate normalized token with per-cluster-normalized aggregates
        f = torch.cat([
            nn.functional.normalize(t, p=2, dim=-1),
            nn.functional.normalize((f * p).sum(dim=-1), p=2, dim=1).flatten(1)
        ], dim=-1)

        # Final L2 normalization of the full descriptor
        return nn.functional.normalize(f, p=2, dim=-1)
weights/best_model_95.6.torch ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6cea6330719dee2b63e70438a2addc7f85242737a5079d6b88af10f7794669b
3
+ size 353426618