intfloat
/

mmE5-mllama-11b-instruct

@@ -44,57 +44,64 @@ pip install -r requirements.txt
 Then, from inside that directory, run the following Python code.
 ```python
-from src.model import MMEBModel
-from src.arguments import ModelArguments
-from src.utils import load_processor
 import torch
-from transformers import HfArgumentParser, AutoProcessor
 from PIL import Image
-import numpy as np
-model_args = ModelArguments(
-    model_name='intfloat/mmE5-mllama-11b-instruct',
-    pooling='last',
-    normalize=True,
-    model_backbone='mllama')
-processor = load_processor(model_args)
-model = MMEBModel.load(model_args)
 model.eval()
-model = model.to('cuda', dtype=torch.bfloat16)
 # Image + Text -> Text
 inputs = processor(text='<|image|><|begin_of_text|> Represent the given image with the following question: What is in the image', images=[Image.open(
-    'figures/example.jpg')], return_tensors="pt")
-inputs = {key: value.to('cuda') for key, value in inputs.items()}
-qry_output = model(qry=inputs)["qry_reps"]
 string = 'A cat and a dog'
-inputs = processor(text=string, return_tensors="pt")
-inputs = {key: value.to('cuda') for key, value in inputs.items()}
-tgt_output = model(tgt=inputs)["tgt_reps"]
-print(string, '=', model.compute_similarity(qry_output, tgt_output))
 ## A cat and a dog = tensor([[0.3965]], device='cuda:0', dtype=torch.bfloat16)
 string = 'A cat and a tiger'
-inputs = processor(text=string, return_tensors="pt")
-inputs = {key: value.to('cuda') for key, value in inputs.items()}
-tgt_output = model(tgt=inputs)["tgt_reps"]
-print(string, '=', model.compute_similarity(qry_output, tgt_output))
 ## A cat and a tiger = tensor([[0.3105]], device='cuda:0', dtype=torch.bfloat16)
 # Text -> Image
-inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a dog.', return_tensors="pt")
-inputs = {key: value.to('cuda') for key, value in inputs.items()}
-qry_output = model(qry=inputs)["qry_reps"]
 string = '<|image|><|begin_of_text|> Represent the given image.'
-inputs = processor(text=string, images=[Image.open('figures/example.jpg')], return_tensors="pt")
-inputs = {key: value.to('cuda') for key, value in inputs.items()}
-tgt_output = model(tgt=inputs)["tgt_reps"]
-print(string, '=', model.compute_similarity(qry_output, tgt_output))
 ## <|image|><|begin_of_text|> Represent the given image. = tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a tiger.', return_tensors="pt")
-inputs = {key: value.to('cuda') for key, value in inputs.items()}
-qry_output = model(qry=inputs)["qry_reps"]
 string = '<|image|><|begin_of_text|> Represent the given image.'
-inputs = processor(text=string, images=[Image.open('figures/example.jpg')], return_tensors="pt")
-inputs = {key: value.to('cuda') for key, value in inputs.items()}
-tgt_output = model(tgt=inputs)["tgt_reps"]
-print(string, '=', model.compute_similarity(qry_output, tgt_output))
 ## <|image|><|begin_of_text|> Represent the given image. = tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
 ```
@@ -106,4 +113,4 @@ print(string, '=', model.compute_similarity(qry_output, tgt_output))
   journal={arXiv preprint arXiv:2502.08468},
   year={2025}
 }
-```

 Then, from inside that directory, run the following Python code.
 ```python
+from transformers import MllamaForConditionalGeneration, AutoProcessor
 import torch
 from PIL import Image
+# Pooling and Normalization
def last_pooling(last_hidden_state, attention_mask, normalize=True):
    """Pool each sequence to the hidden state of its last attended token.

    Works for right-padded, left-padded and unpadded batches: the position
    used for a row is the highest index whose attention-mask entry is
    non-zero.  A row whose mask is entirely zero falls back to the final
    position of the sequence.

    Args:
        last_hidden_state: Tensor indexed as ``[batch, position, ...]``.
        attention_mask: 2-D ``[batch, position]`` mask, non-zero on real tokens.
        normalize: When true, L2-normalise the pooled vectors along the
            last dimension.

    Returns:
        One pooled vector per batch row.
    """
    seq_len = attention_mask.shape[1]
    # Searching the reversed mask for its first non-zero entry locates the
    # last attended token without assuming which side the padding is on.
    # argmax returns the first maximal index, so an all-zero row yields 0
    # here and therefore the final position below.
    attended = (attention_mask != 0).to(torch.long)
    offset_from_end = torch.flip(attended, dims=[1]).argmax(dim=1)
    last_token_idx = (seq_len - 1 - offset_from_end).to(last_hidden_state.device)

    batch_idx = torch.arange(
        last_hidden_state.shape[0], device=last_hidden_state.device
    )
    reps = last_hidden_state[batch_idx, last_token_idx]
    if normalize:
        reps = torch.nn.functional.normalize(reps, p=2, dim=-1)
    return reps
def compute_similarity(q_reps, p_reps):
    """Return the matrix of dot products between query and passage vectors.

    Entry ``[i, j]`` of the result is the dot product of ``q_reps[i]`` and
    ``p_reps[j]``.
    """
    passages_t = torch.transpose(p_reps, 0, 1)
    return q_reps @ passages_t
+model_name = "intfloat/mmE5-mllama-11b-instruct"
+# Load Processor and Model
+processor = AutoProcessor.from_pretrained(model_name)
+model = MllamaForConditionalGeneration.from_pretrained(
+    model_name, torch_dtype=torch.bfloat16
+).to("cuda")
 model.eval()
 # Image + Text -> Text
 inputs = processor(text='<|image|><|begin_of_text|> Represent the given image with the following question: What is in the image', images=[Image.open(
+    'figures/example.jpg')], return_tensors="pt").to("cuda")
+qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
 string = 'A cat and a dog'
+text_inputs = processor(text=string, return_tensors="pt").to("cuda")
+tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
+print(string, '=', compute_similarity(qry_output, tgt_output))
 ## A cat and a dog = tensor([[0.3965]], device='cuda:0', dtype=torch.bfloat16)
 string = 'A cat and a tiger'
+text_inputs = processor(text=string, return_tensors="pt").to("cuda")
+tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
+print(string, '=', compute_similarity(qry_output, tgt_output))
 ## A cat and a tiger = tensor([[0.3105]], device='cuda:0', dtype=torch.bfloat16)
 # Text -> Image
+inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a dog.', return_tensors="pt").to("cuda")
+qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
 string = '<|image|><|begin_of_text|> Represent the given image.'
+tgt_inputs = processor(text=string, images=[Image.open('figures/example.jpg')], return_tensors="pt").to("cuda")
+tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
+print(string, '=', compute_similarity(qry_output, tgt_output))
 ## <|image|><|begin_of_text|> Represent the given image. = tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
+inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a tiger.', return_tensors="pt").to("cuda")
+qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
 string = '<|image|><|begin_of_text|> Represent the given image.'
+tgt_inputs = processor(text=string, images=[Image.open('figures/example.jpg')], return_tensors="pt").to("cuda")
+tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
+print(string, '=', compute_similarity(qry_output, tgt_output))
 ## <|image|><|begin_of_text|> Represent the given image. = tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
 ```
   journal={arXiv preprint arXiv:2502.08468},
   year={2025}
 }
+```

custom_st.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from io import BytesIO
+from typing import Any, Dict, Optional, List
+import torch
+from PIL import Image
+from transformers import AutoProcessor, MllamaForConditionalGeneration
+from sentence_transformers.models import Transformer as BaseTransformer
class MultiModalTransformer(BaseTransformer):
    """Transformer module producing embeddings from text and images via Mllama.

    Each input is embedded as the L2-normalised final-layer hidden state of
    its last attended token.
    """

    def __init__(
        self,
        model_name_or_path: str,
        cache_dir: Optional[str] = None,
        tokenizer_args: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """Load the model through the base class, then the matching processor.

        Args:
            model_name_or_path: Model identifier or local path.
            cache_dir: Download/cache directory used for both the model and
                the processor.
            tokenizer_args: Extra keyword arguments for
                ``AutoProcessor.from_pretrained``.
            **kwargs: Forwarded unchanged to the base ``Transformer``.
        """
        # Forward cache_dir explicitly: it is a named parameter here, so it is
        # never part of **kwargs and would otherwise not reach the base class.
        super().__init__(model_name_or_path, cache_dir=cache_dir, **kwargs)
        if tokenizer_args is None:
            tokenizer_args = {}
        # Initialize processor
        self.processor = AutoProcessor.from_pretrained(
            model_name_or_path, cache_dir=cache_dir, **tokenizer_args
        )

    def _load_model(
        self,
        model_name_or_path: str,
        config,
        cache_dir: str,
        backend: str,
        is_peft_model: bool,
        **model_args,
    ) -> None:
        """Load the Mllama weights in bfloat16 into ``self.auto_model``.

        ``config``, ``backend`` and ``is_peft_model`` are accepted for
        signature compatibility and are not used.
        """
        self.auto_model = MllamaForConditionalGeneration.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            cache_dir=cache_dir,
            **model_args,
        )

    def forward(
        self, features: Dict[str, torch.Tensor], **kwargs
    ) -> Dict[str, torch.Tensor]:
        """Run the model and add ``"sentence_embedding"`` to ``features``.

        Every entry of ``features`` is passed to the model as a keyword
        argument; ``features`` must contain ``"attention_mask"``.  The mapping
        is updated in place and returned.
        """
        # Process inputs through the model
        outputs = self.auto_model(
            **features,
            return_dict=True,
            output_hidden_states=True,
            **kwargs,
        )
        # Apply last pooling and normalization
        sentence_embedding = self._last_pooling(
            outputs.hidden_states[-1], features["attention_mask"]
        )
        features.update({"sentence_embedding": sentence_embedding})
        return features

    def _last_pooling(
        self, last_hidden_state: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Apply last token pooling and L2 normalization.

        The pooled position of each row is the highest index with a non-zero
        mask entry, so right-padded, left-padded and unpadded batches are all
        handled.  A row with an all-zero mask uses the final position.
        """
        seq_len = attention_mask.shape[1]
        # First non-zero entry of the reversed mask == last attended token.
        attended = (attention_mask != 0).to(torch.long)
        offset_from_end = torch.flip(attended, dims=[1]).argmax(dim=1)
        last_token_idx = (seq_len - 1 - offset_from_end).to(
            last_hidden_state.device
        )
        batch_idx = torch.arange(
            last_hidden_state.shape[0], device=last_hidden_state.device
        )
        reps = last_hidden_state[batch_idx, last_token_idx]
        return torch.nn.functional.normalize(reps, p=2, dim=-1)

    def tokenize(self, texts: List[List[Dict]] | List[str]) -> Dict[str, torch.Tensor]:
        """Convert a batch of inputs into processor outputs.

        Each element of ``texts`` is either a plain string or a list of dicts
        with keys ``"type"`` and ``"content"``.  Supported types are
        ``"text"``, ``"image_bytes"`` (encoded image bytes) and
        ``"image_path"`` (a path or file object accepted by ``Image.open``).
        Every image contributes an ``<|image|>`` placeholder to the text at
        the position where it appears.

        Raises:
            ValueError: If an item has an unrecognised ``"type"``.
        """

        def process_text_item(item):
            # Plain strings carry no images.
            if isinstance(item, str):
                return item, []
            parts, images = [], []
            for sub_item in item:
                kind = sub_item["type"]
                if kind == "text":
                    parts.append(sub_item["content"])
                elif kind in ("image_bytes", "image_path"):
                    parts.append("<|image|>")
                    if kind == "image_bytes":
                        source = BytesIO(sub_item["content"])
                    else:
                        source = sub_item["content"]
                    # convert() returns an independent in-memory image, so the
                    # underlying file can be closed right away.
                    with Image.open(source) as raw:
                        images.append(raw.convert("RGB"))
                else:
                    raise ValueError(f"Unknown data type {sub_item['type']}")
            return "".join(parts), images

        all_texts, all_images = [], []
        for item in texts:
            text, images = process_text_item(item)
            all_texts.append(text)
            # NOTE(review): images from all items are passed as one flat list;
            # confirm the processor maps them to samples as intended when a
            # batch mixes several image-bearing items.
            all_images.extend(images)

        # Process inputs through the processor
        processor_kwargs = {
            "text": all_texts,
            "padding": "longest",
            "truncation": True,
            "max_length": self.max_seq_length,
            "return_tensors": "pt",
        }
        # The images argument is only supplied when at least one image exists.
        if all_images:
            processor_kwargs["images"] = all_images
        return self.processor(**processor_kwargs)