VQ-VAE CLIP Vision

Calculate the CLIP score on the latent.

The AIMv2 model was used only during training and is discarded at inference time.

Inference

def clip_score(text_feature, image_feature, scale=None):
    """Return softmax-normalized CLIP similarity probabilities.

    Computes cosine similarity between every image embedding and every text
    embedding, scales the logits, and returns the probability distribution
    over texts for the FIRST image.

    Args:
        text_feature: tensor of shape (num_texts, dim) with text embeddings.
        image_feature: tensor of shape (num_images, dim) with image embeddings.
        scale: optional 0-dim tensor holding the model's learned log logit
            scale. Falls back to log(100), CLIP's standard value, when the
            caller does not pass the model's own `scale`.
            (The original snippet read an undefined module-level `scale`.)

    Returns:
        list[float]: probabilities over the texts for the first image.
    """
    if scale is None:
        # CLIP initializes its logit scale to log(1/0.07) ~= log(100).
        scale = torch.tensor(100.0).log()
    # L2-normalize so the matmul below is a cosine similarity.
    image_feature = image_feature / image_feature.norm(p=2, dim=-1, keepdim=True)
    text_feature = text_feature / text_feature.norm(p=2, dim=-1, keepdim=True)
    logits_per_image = scale.exp() * (image_feature @ text_feature.T)
    # softmax over the text axis; keep only the first image's distribution.
    probs = logits_per_image.softmax(dim=1)[0]

    return probs.tolist()

def encode(pixel_values):
    """Run images through the Meissonic VQ-VAE encoder (no quantization).

    Applies conv_in, the down blocks, the mid block, the output norm, and the
    output activation of the pretrained encoder — i.e. the continuous latent
    just before the VQ bottleneck.

    Args:
        pixel_values: image batch tensor in the layout the VQ encoder expects
            (presumably (b, c, h, w) — confirmed only by the caller's assert).

    Returns:
        The activated encoder output tensor.
    """
    # Cache the pretrained model on the function object: the original
    # re-downloaded/re-built the VQModel on every call.
    model = getattr(encode, '_model', None)
    if model is None:
        model = VQModel.from_pretrained('MeissonFlow/Meissonic', subfolder='vqvae')
        encode._model = model
    y = model.encoder.conv_in(pixel_values)
    for down_block in model.encoder.down_blocks:
        y = down_block(y)
    y = model.encoder.mid_block(y)
    y = model.encoder.conv_norm_out(y)

    return model.encoder.conv_act(y)

class CLIPVision:
  # NOTE(review): the class body is elided here ("..."); `self.final_layer`
  # must be defined in the omitted portion (likely __init__), and the bare
  # `model(pixel_values)` call style suggests an nn.Module base — confirm.
  ...
  def forward(self, pixel_values):
      # Project VQ-VAE encoder latents into a CLIP-style embedding sequence:
      # a pooled token followed by the first 1024 per-position tokens.
      b, c, h, w = pixel_values.shape
      # Encoder was trained at a fixed 448x448 resolution.
      assert w == 448 and h == 448

      with torch.no_grad():
          # Frozen VQ-VAE encoder; only final_layer below is trainable.
          output = encode(pixel_values)
          b, dim, _, _ = output.shape
          # NOTE(review): permute(0, 3, 1, 2) maps (b, dim, h, w) -> (b, w, dim, h);
          # a channels-last token layout would be permute(0, 2, 3, 1). Confirm
          # this ordering is intentional before reuse.
          output = output.permute(0, 3, 1, 2).contiguous()
          # NOTE(review): batch size is hard-coded to 1 here; this view scrambles
          # or fails for b > 1 — verify callers only ever pass a single image.
          vq_output = output.view(1, -1, dim)
      output = self.final_layer(vq_output)
      # Mean-pool the first 1024 tokens into a single pooled embedding.
      pooled = output[:, :1024, :].mean(dim=1)

      # Prepend the pooled embedding as token 0 ahead of the 1024 tokens.
      return torch.cat([pooled.unsqueeze(1), output[:, :1024, :]], dim=1)

# Expected input resolution for the VQ encoder (asserted in CLIPVision.forward).
image_size = 448
model = CLIPVision()
# FG-CLIP supplies the text tower. trust_remote_code executes code from the
# model repository — only enable for sources you trust.
clip_model = AutoModelForCausalLM.from_pretrained('qihoo360/fg-clip-base', trust_remote_code=True)
clip_tokenizer = AutoTokenizer.from_pretrained('qihoo360/fg-clip-base')
...
# NOTE(review): `pixel_values` and `text_feature` are not defined in this
# snippet — presumably produced by image preprocessing and the FG-CLIP text
# tower in the elided section above; verify before running.
pooled = model(pixel_values)
clip_score(text_feature, pooled)
Downloads last month

-

Downloads are not tracked for this model. How to track
Safetensors
Model size
48.4M params
Tensor type
F32
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for twodgirl/vq-clip-vision

Finetuned
(5)
this model