# VQ-VAE CLIP Vision
Calculate the CLIP score directly on the VQ-VAE latent rather than on pixels. The AIMv2 model was used only as a training target and is discarded at inference.
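The card doesn't spell out the training objective, but the output layout below (a pooled vector plus the first 1024 tokens, the patch count of AIMv2-large at 448px with patch size 14) points to feature distillation. A minimal sketch under that assumption, regressing the tokens onto frozen AIMv2 features with an MSE loss; the loss choice and the AIMv2 call are illustrative guesses, not the author's confirmed recipe:

```python
import torch
import torch.nn.functional as F
from transformers import AutoModel

# Hypothetical distillation target: frozen AIMv2 features (assumption).
aim = AutoModel.from_pretrained('apple/aimv2-large-patch14-448',
                                trust_remote_code=True).eval()

def train_step(model, pixel_values_vq, pixel_values_aim):
    # `model` is the CLIPVision module defined under Inference below.
    with torch.no_grad():
        target = aim(pixel_values=pixel_values_aim).last_hidden_state  # (b, 1024, dim)
    pred = model(pixel_values_vq)  # pooled vector followed by the patch tokens
    # Match the patch tokens; the pooled slot could be matched to a pooled target.
    return F.mse_loss(pred[:, 1:, :], target)
```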
## Inference
```python
import torch

def clip_score(text_feature, image_feature, scale):
    # L2-normalize both embeddings before the cosine-similarity matmul.
    image_feature = image_feature / image_feature.norm(p=2, dim=-1, keepdim=True)
    text_feature = text_feature / text_feature.norm(p=2, dim=-1, keepdim=True)
    logits_per_image = image_feature @ text_feature.T
    # `scale` is the learned log-temperature (e.g. clip_model.logit_scale);
    # it was a free variable in the original snippet, so it is now a parameter.
    logits_per_image = scale.exp() * logits_per_image
    probs = logits_per_image.softmax(dim=1)[0]
    return probs.tolist()
```
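For shape reference, a minimal call with random features; the 512-dim width and the ln(100) temperature are placeholder assumptions:

```python
text_feature = torch.randn(2, 512)   # two candidate captions
image_feature = torch.randn(1, 512)  # one image embedding
scale = torch.tensor(2.6593)         # ~ln(100), a typical learned CLIP temperature
print(clip_score(text_feature, image_feature, scale))  # two probabilities summing to 1
```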
```python
from diffusers import VQModel

def encode(pixel_values):
    # Run only the VQ-VAE encoder; the quantizer and decoder are unused.
    # Loading the checkpoint inside the function reloads it on every call;
    # in practice, load it once and reuse it.
    model = VQModel.from_pretrained('MeissonFlow/Meissonic', subfolder='vqvae')
    y = model.encoder.conv_in(pixel_values)
    for down_block in model.encoder.down_blocks:
        y = down_block(y)
    y = model.encoder.mid_block(y)
    y = model.encoder.conv_norm_out(y)
    return model.encoder.conv_act(y)
```
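`encode` expects a 448×448 pixel tensor. A hypothetical preprocessing step, assuming the diffusers-style [-1, 1] pixel range; the file name and the exact normalization are placeholders:

```python
from PIL import Image
import torchvision.transforms as T

preprocess = T.Compose([
    T.Resize((448, 448)),
    T.ToTensor(),                                 # [0, 1]
    T.Normalize(mean=[0.5] * 3, std=[0.5] * 3),   # -> [-1, 1]
])
pixel_values = preprocess(Image.open('example.jpg').convert('RGB')).unsqueeze(0)
latent = encode(pixel_values)  # (1, C, H/f, W/f), f = encoder downsampling factor
```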
```python
import torch.nn as nn

class CLIPVision(nn.Module):
    ...

    def forward(self, pixel_values):
        b, c, h, w = pixel_values.shape
        assert w == 448 and h == 448
        # The frozen VQ-VAE encoder produces the latent feature map.
        with torch.no_grad():
            output = encode(pixel_values)
        b, dim, _, _ = output.shape
        # NCHW -> NHWC, then flatten the spatial grid into a token sequence.
        # (The original permute(0, 3, 1, 2) would not put `dim` last.)
        output = output.permute(0, 2, 3, 1).contiguous()
        vq_output = output.view(b, -1, dim)
        output = self.final_layer(vq_output)
        # Mean-pool the first 1024 tokens and prepend the pooled vector,
        # mirroring a ViT-style [CLS] + patch-token layout.
        pooled = output[:, :1024, :].mean(dim=1)
        return torch.cat([pooled.unsqueeze(1), output[:, :1024, :]], dim=1)
```
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

image_size = 448
model = CLIPVision()
clip_model = AutoModelForCausalLM.from_pretrained('qihoo360/fg-clip-base', trust_remote_code=True)
clip_tokenizer = AutoTokenizer.from_pretrained('qihoo360/fg-clip-base')
...
# forward() returns the pooled vector followed by the patch tokens;
# token 0 is the pooled image embedding used for scoring.
pooled = model(pixel_values)[:, 0]
clip_score(text_feature, pooled, clip_model.logit_scale)
```
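The elided step above computes `text_feature`. A sketch following the fg-clip-base model card's text path; treat the exact remote-code call (`get_text_features` with `walk_short_pos=True` for short captions) as an assumption:

```python
captions = ['a photo of a cat', 'a photo of a dog']
inputs = clip_tokenizer(captions, padding='max_length', max_length=77,
                        truncation=True, return_tensors='pt')
with torch.no_grad():
    text_feature = clip_model.get_text_features(inputs['input_ids'], walk_short_pos=True)
```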
Base model: apple/aimv2-large-patch14-448