import os
from typing import Callable, Optional

import numpy as np
import torch
from transformers import CLIPModel, CLIPProcessor
from transformers.modeling_utils import PreTrainedModel

from .configuration_cased import CaSEDConfig
from .retrieval_cased import RetrievalDatabase, download_retrieval_databases
from .transforms_cased import default_vocabulary_transforms


class CaSEDModel(PreTrainedModel):
    """Transformers module for Category Search from External Databases (CaSED).

    Reference:
        - Conti et al. Vocabulary-free Image Classification. NeurIPS 2023.

    Args:
        config (CaSEDConfig): Configuration class for CaSED.
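
    Example:
        A minimal usage sketch (illustrative only; it assumes the default ``CaSEDConfig``
        values, a local image file, and network access to download CLIP and the
        retrieval databases on first use):

        >>> from PIL import Image
        >>> model = CaSEDModel(CaSEDConfig())
        >>> image = Image.open("example.jpg")  # hypothetical image path
        >>> inputs = model.processor(images=image, return_tensors="pt")
        >>> outputs = model({"pixel_values": inputs["pixel_values"]})
        >>> best = outputs["scores"].argmax(dim=-1)
        >>> [outputs["words"][i] for i in best.tolist()]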
    """

    config_class = CaSEDConfig

    def __init__(self, config: CaSEDConfig):
        super().__init__(config)
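
        # load the pre-trained CLIP encoders, projections, and processor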
        model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
        self.vision_encoder = model.vision_model
        self.vision_proj = model.visual_projection
        self.language_encoder = model.text_model
        self.language_proj = model.text_projection
        self.logit_scale = model.logit_scale.exp()
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
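
        # store the hyperparameters from the configuration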
        self.hparams = {}
        self.hparams["alpha"] = config.alpha
        self.hparams["index_name"] = config.index_name
        self.hparams["retrieval_num_results"] = config.retrieval_num_results
        self.hparams["cache_dir"] = config.cache_dir
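
        # create the cache directory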
        os.makedirs(self.hparams["cache_dir"], exist_ok=True)
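
        # download the retrieval databases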
        download_retrieval_databases(cache_dir=self.hparams["cache_dir"])
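
        # load the retrieval database and the vocabulary transforms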
        self.vocabulary = RetrievalDatabase("cc12m", self.hparams["cache_dir"])
        self._vocab_transform = default_vocabulary_transforms()

    @property
    def vocab_transform(self) -> Callable:
        """Get the vocabulary transform.

        The getter wraps the transform in a function that applies it to a list of
        vocabularies, one per image. If interested in the transform itself, use
        `self._vocab_transform`.
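
        Example (illustrative): ``self.vocab_transform([["a photo of a dog"], ["two cats"]])``
        applies ``self._vocab_transform`` to each caption list and returns one candidate
        vocabulary per image.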
        """
        vocab_transform = self._vocab_transform

        def vocabs_transforms(texts: list[list[str]]) -> list[list[str]]:
            return [vocab_transform(text) for text in texts]

        return vocabs_transforms

    def get_vocabulary(self, images_z: Optional[torch.Tensor] = None) -> list[list[str]]:
        """Get the vocabulary for a batch of images.

        Args:
            images_z (torch.Tensor): Batch of image embeddings.
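
        Returns:
            list[list[str]]: The captions retrieved from the database for each image,
            up to ``retrieval_num_results`` entries per image.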
        """
        num_samples = self.hparams["retrieval_num_results"]

        assert images_z is not None

        # normalize the image embeddings and convert them to a nested list
        images_z = images_z / images_z.norm(dim=-1, keepdim=True)
        images_z = images_z.cpu().detach().numpy().tolist()

        # wrap a single embedding into a batch of one
        if isinstance(images_z[0], float):
            images_z = [images_z]

        # query the retrieval database with the image embeddings
        query = np.asarray(images_z, dtype="float32")
        results = self.vocabulary.query(query, modality="text", num_samples=num_samples)

        vocabularies = [[r["caption"] for r in result] for result in results]
        return vocabularies

    def forward(self, images: dict, alpha: Optional[float] = None) -> dict:
        """Forward pass.

        Args:
            images (dict): Dictionary with the images. The expected keys are:
                - pixel_values (torch.Tensor): Pixel values of the images.
            alpha (Optional[float]): Alpha value for the interpolation.
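
        Returns:
            dict: Dictionary with the following keys:
                - scores (torch.Tensor): Interpolated image-to-word and text-to-word
                  probabilities over the candidate words.
                - words (list[str]): Flat list of candidate words for the batch.
                - vocabularies (list[list[str]]): Filtered vocabulary of each image.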
        """
        alpha = alpha if alpha is not None else self.hparams["alpha"]
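
        # encode the images and retrieve a vocabulary for each of them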
        images["pixel_values"] = images["pixel_values"].to(self.device)
        images_z = self.vision_proj(self.vision_encoder(**images)[1])
        images_z = images_z / images_z.norm(dim=-1, keepdim=True)
        vocabularies = self.get_vocabulary(images_z=images_z)
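
        # encode the unfiltered retrieved captions, truncated to CLIP's 77-token context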
        unfiltered_words = sum(vocabularies, [])
        texts_z = self.processor(unfiltered_words, return_tensors="pt", padding=True)
        texts_z["input_ids"] = texts_z["input_ids"][:, :77].to(self.device)
        texts_z["attention_mask"] = texts_z["attention_mask"][:, :77].to(self.device)
        texts_z = self.language_encoder(**texts_z)[1]
        texts_z = self.language_proj(texts_z)
        texts_z = texts_z / texts_z.norm(dim=-1, keepdim=True)
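
        # average the caption embeddings into a single text centroid per image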
        unfiltered_words_per_image = [len(vocab) for vocab in vocabularies]
        texts_z = torch.split(texts_z, unfiltered_words_per_image)
        texts_z = torch.stack([text_z.mean(dim=0) for text_z in texts_z])
        texts_z = texts_z / texts_z.norm(dim=-1, keepdim=True)
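
        # filter the captions into candidate words and encode them, falling back to "object"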
        vocabularies = self.vocab_transform(vocabularies)
        vocabularies = [vocab or ["object"] for vocab in vocabularies]
        words = sum(vocabularies, [])
        words_z = self.processor(words, return_tensors="pt", padding=True)
        words_z = {k: v.to(self.device) for k, v in words_z.items()}
        words_z = self.language_encoder(**words_z)[1]
        words_z = self.language_proj(words_z)
        words_z = words_z / words_z.norm(dim=-1, keepdim=True)
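
        # build a mask that pairs each image only with its own candidate words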
        words_per_image = [len(vocab) for vocab in vocabularies]
        col_indices = torch.arange(sum(words_per_image))
        row_indices = torch.arange(len(images_z)).repeat_interleave(torch.tensor(words_per_image))
        mask = torch.zeros(len(images_z), sum(words_per_image), device=self.device)
        mask[row_indices, col_indices] = 1
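
        # compute the image-to-word and text-centroid-to-word similarities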
        images_z = images_z / images_z.norm(dim=-1, keepdim=True)
        texts_z = texts_z / texts_z.norm(dim=-1, keepdim=True)
        words_z = words_z / words_z.norm(dim=-1, keepdim=True)
        images_sim = self.logit_scale * images_z @ words_z.T
        texts_sim = self.logit_scale * texts_z @ words_z.T
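
        # mask out the words that belong to other images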
        images_sim = torch.masked_fill(images_sim, mask == 0, float("-inf"))
        texts_sim = torch.masked_fill(texts_sim, mask == 0, float("-inf"))
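
        # convert the similarities to probabilities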
        images_p = images_sim.softmax(dim=-1)
        texts_p = texts_sim.softmax(dim=-1)
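
        # interpolate the image and text predictions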
        samples_p = alpha * images_p + (1 - alpha) * texts_p

        return {"scores": samples_p, "words": words, "vocabularies": vocabularies}