File size: 1,791 Bytes
2ecc846
 
 
 
 
3aa023a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ecc846
3aa023a
2ecc846
3aa023a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ecc846
 
3aa023a
 
 
 
2ecc846
3aa023a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import torch
import torch.nn as nn
import open_clip
from huggingface_hub import hf_hub_download


class RankingHead(nn.Module):
    """Scalar scoring head: Linear -> GELU -> Dropout -> Linear.

    The submodule indices are load-bearing: a training checkpoint stores the
    weights under ``head.net.0.*`` and ``head.net.3.*``, so the layers must
    stay in this exact positional ``nn.Sequential``:

        net.0  Linear(in_dim, hidden_dim)
        net.1  GELU
        net.2  Dropout
        net.3  Linear(hidden_dim, 1)
    """

    def __init__(self, in_dim, hidden_dim=256, dropout=0.1):
        super().__init__()
        # Positional Sequential (no OrderedDict) so state_dict keys remain
        # numeric and match the checkpoint layout documented above.
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # (..., in_dim) -> (..., 1) scalar score per input row.
        return self.net(x)


class MobileCLIPRanker(nn.Module):
    """Frozen MobileCLIP2-L-14 visual backbone + trainable RankingHead.

    The backbone is downloaded from the Hugging Face Hub at construction
    time, put in eval mode, and fully frozen; only the head receives
    gradients. ``train()`` is overridden so the backbone never flips back
    into training mode (keeps its norm/dropout layers in inference behavior).
    """

    def __init__(self, backbone_dim=768, head_hidden_dim=256, head_dropout=0.1):
        super().__init__()
        self.backbone_dim = backbone_dim

        print("Initializing MobileCLIP2-L14 backbone...")
        # Fetch the pretrained checkpoint and build the open_clip model;
        # only the visual tower is kept.
        ckpt_path = hf_hub_download(
            repo_id="apple/MobileCLIP2-L-14",
            filename="mobileclip2_l14.pt",
        )
        model, _, _ = open_clip.create_model_and_transforms(
            "MobileCLIP2-L-14", pretrained=ckpt_path
        )
        self.backbone = model.visual

        # Freeze the backbone: eval mode + no gradients anywhere in it.
        self.backbone.eval()
        for param in self.backbone.parameters():
            param.requires_grad = False

        self.head = RankingHead(backbone_dim, head_hidden_dim, head_dropout)

    def train(self, mode=True):
        # Propagate the mode to the head, then force the frozen backbone
        # back to eval regardless of the requested mode.
        super().train(mode)
        self.backbone.eval()
        return self

    def forward(self, x, valid_lens=None):
        # NOTE(review): valid_lens is accepted but unused — presumably kept
        # for interface compatibility with a caller; confirm before removing.
        if x.dim() != 5:
            # Already-extracted features: score them directly.
            return self.head(x)
        # Grouped images (batch, group, C, H, W): fold the group axis into
        # the batch for the backbone pass, then restore it for the head.
        batch, group, chan, height, width = x.shape
        flat = x.view(batch * group, chan, height, width)
        feats = self.backbone(flat).view(batch, group, -1)
        return self.head(feats)