| | import torch |
| | import torch.nn as nn |
| | from transformers import AutoProcessor, AutoModelForVision2Seq |
| | from transformers.image_utils import load_image |
| |
|
| |
|
class ideficsV3(nn.Module):
    """Vision-only wrapper around the vision model of a Vision2Seq checkpoint.

    The full checkpoint is loaded once, but only its ``model.vision_model``
    sub-module and the matching image processor are kept as attributes.
    """

    def __init__(self, model_name="HuggingFaceTB/SmolVLM-Instruct"):
        """Load the image processor and the vision model for *model_name*.

        Args:
            model_name: Hub identifier or local path passed unchanged to
                ``from_pretrained``.
        """
        super().__init__()

        # Plain attribute (not a sub-module): used by callers for preprocessing.
        self.image_processor = AutoProcessor.from_pretrained(model_name).image_processor
        smolVLM = AutoModelForVision2Seq.from_pretrained(
            model_name, torch_dtype=torch.float32
        )

        # Only the vision model is stored on the module; the rest of the
        # checkpoint is referenced solely by the local variable above.
        self.vision_model = smolVLM.model.vision_model

    def forward(self, pixel_values):
        """Run the vision model and return its last hidden state.

        Args:
            pixel_values: Either a 5-D tensor
                ``(batch_size, num_patches, channels, height, width)`` whose
                first two axes are merged before the call, or a 4-D tensor
                ``(N, channels, height, width)`` that is passed through as is.

        Returns:
            ``last_hidden_state`` of the vision model output.

        Raises:
            ValueError: If ``pixel_values`` is neither 4-D nor 5-D.
        """
        rank = pixel_values.dim()
        if rank == 5:
            # flatten() behaves like reshape(): it returns a view when the
            # strides allow it and copies otherwise, so non-contiguous inputs
            # (e.g. produced by transpose/permute) no longer fail as with view().
            pixel_values = pixel_values.flatten(0, 1)
        elif rank != 4:
            raise ValueError(
                "pixel_values must be 4-D (N, C, H, W) or 5-D "
                f"(batch, patches, C, H, W); got shape {tuple(pixel_values.shape)}"
            )

        vision_outputs = self.vision_model(pixel_values)
        return vision_outputs.last_hidden_state
| | |
if __name__ == "__main__":
    # Smoke test: encode two sample images and print the feature tensor shape.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    truncated_model = ideficsV3().to(device).eval()

    # Take the dtype from the model's own weights so the inputs always match
    # it (the original referenced an undefined ``model_dtype`` name here).
    model_dtype = next(truncated_model.parameters()).dtype

    image1 = load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg")
    image2 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")

    inputs1 = truncated_model.image_processor(images=[image1, image2], return_tensors="pt")
    # Single transfer that moves the tensor and casts it in one step.
    pixel_values = inputs1.pixel_values.to(device=device, dtype=model_dtype)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = truncated_model(pixel_values)

    print(outputs.shape)
| |
|
| |
|