nolan4
/

modernBERT-base-CLIP

Zero-Shot Image Classification

Model card Files Files and versions

modernBERT-base-CLIP / vision_encoder.py

nolan4's picture

initial commit

8a00d0d about 1 year ago

2.63 kB

	import torch
	import torch.nn as nn
	from transformers import AutoProcessor, AutoModelForVision2Seq
	from transformers.image_utils import load_image


	class ideficsV3(nn.Module):
	    """Standalone image encoder built from the vision tower of a SmolVLM checkpoint.

	    The full vision-to-sequence model is loaded once in float32 and only its
	    ``model.vision_model`` submodule is kept. The checkpoint's image processor
	    is exposed as ``self.image_processor`` so callers can prepare inputs.

	    Args:
	        model_name: Hugging Face model id (or local path) of the checkpoint to
	            take the vision tower and image processor from.
	    """

	    def __init__(self, model_name="HuggingFaceTB/SmolVLM-Instruct"):
	        super().__init__()

	        # Image pre-processing that matches the checkpoint.
	        self.image_processor = AutoProcessor.from_pretrained(model_name).image_processor

	        # Load the whole model, but keep a reference to the vision tower only;
	        # the rest of the model is released when this local goes out of scope.
	        smolVLM = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float32)
	        self.vision_model = smolVLM.model.vision_model

	    def forward(self, pixel_values):
	        """Encode images (or image patches) with the vision transformer.

	        Args:
	            pixel_values: Either a 5D tensor
	                ``[batch_size, num_patches, channels, height, width]`` (the
	                original code was written for an input of shape
	                ``[1, 13, 3, 384, 384]``), or an already flattened 4D tensor
	                ``[batch, channels, height, width]``.

	        Returns:
	            ``last_hidden_state`` of the vision transformer. For 5D input the
	            leading dimension is ``batch_size * num_patches``; the original
	            author notes a shape of ``[batch_size * num_patches, 729, 1152]``
	            for the default checkpoint.

	        Raises:
	            ValueError: If ``pixel_values`` is neither 4D nor 5D.
	        """
	        if pixel_values.ndim == 5:
	            # The vision transformer expects a 4D batch, so fold the patch
	            # dimension into the batch dimension. flatten() returns a view when
	            # it can and copies otherwise, so non-contiguous inputs also work
	            # (a plain .view() would raise on them).
	            pixel_values = pixel_values.flatten(0, 1)
	        elif pixel_values.ndim != 4:
	            raise ValueError(
	                "pixel_values must be 4D [batch, channels, height, width] or "
	                "5D [batch, num_patches, channels, height, width], "
	                f"got shape {tuple(pixel_values.shape)}"
	            )

	        # Run images through the vision transformer
	        vision_outputs = self.vision_model(pixel_values)
	        return vision_outputs.last_hidden_state

	if __name__ == "__main__":
	    # Smoke test: encode two sample images with the truncated vision model.

	    # Instantiate truncated model (inference only, so switch to eval mode once)
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    truncated_model = ideficsV3().to(device).eval()

	    # Match the input dtype to the model's weights instead of relying on an
	    # undefined `model_dtype` name.
	    model_dtype = next(truncated_model.parameters()).dtype

	    image1 = load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg")
	    image2 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")

	    inputs1 = truncated_model.image_processor(images=[image1, image2], return_tensors="pt")
	    pixel_values = inputs1.pixel_values.to(model_dtype).to(device)

	    # Pass pixel_values through the truncated model
	    with torch.no_grad():
	        outputs = truncated_model(pixel_values)

	    # No projection layer is applied here: the output is the vision model's
	    # last hidden state, i.e. [batch_size * num_patches, 729, 1152] according
	    # to the shape noted in forward().
	    print(outputs.shape)