Integrate with Sentence Transformers v5.4

by tomaarsen HF Staff - opened Apr 8

base: refs/heads/main

←

from: refs/pr/7

Discussion Files changed

+130

-1

Files changed (5) hide show

README.md +49 -1
bge_vl_clip_transformer.py +43 -0
config_sentence_transformers.json +6 -0
modules.json +14 -0
sentence_bert_config.json +18 -0

README.md CHANGED Viewed

@@ -4,9 +4,12 @@ language:
 - en
 base_model:
 - openai/clip-vit-base-patch16
 tags:
 - multimodal-retrieval
 - embedding-model
 ---
 <h1 align="center">MegaPairs: Massive Data Synthesis For Universal Multimodal Retrieval</h1>
@@ -61,7 +64,52 @@ BGE-VL achieve state-of-the-art performance on four popular zero-shot composed i
 ## Model Usage
-### 1. BGE-VL-CLIP Models
 You can easily use BGE-VL-CLIP models based on ```transformers```
 ```python
 import torch

 - en
 base_model:
 - openai/clip-vit-base-patch16
+library_name: sentence-transformers
 tags:
+- sentence-transformers
 - multimodal-retrieval
 - embedding-model
+pipeline_tag: sentence-similarity
 ---
 <h1 align="center">MegaPairs: Massive Data Synthesis For Universal Multimodal Retrieval</h1>
 ## Model Usage
+### Using Sentence Transformers
+Install Sentence Transformers:
+```bash
+pip install -U "sentence-transformers[image]"
+```
+```python
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer("BAAI/BGE-VL-base", trust_remote_code=True)
+query_image = "https://huggingface.co/BAAI/BGE-VL-base/resolve/main/assets/cir_query.png"
+candidate_1 = "https://huggingface.co/BAAI/BGE-VL-base/resolve/main/assets/cir_candi_1.png"
+candidate_2 = "https://huggingface.co/BAAI/BGE-VL-base/resolve/main/assets/cir_candi_2.png"
+# Encode text
+text_embeddings = model.encode(["A dog sitting on a bench", "A cat sleeping on a couch"])
+print(text_embeddings.shape)
+# (2, 512)
+# Encode images
+image_embeddings = model.encode([query_image, candidate_1])
+print(image_embeddings.shape)
+# (2, 512)
+# Compute similarities
+similarities = model.similarity(text_embeddings, image_embeddings)
+print(similarities)
+# tensor([[0.1050, 0.0871],
+#         [0.0010, 0.0355]])
+# Composed image retrieval: encode image+text query, compare with image candidates
+query_embeddings = model.encode([{
+    "image": query_image,
+    "text": "Make the background dark, as if the camera has taken the photo at night",
+}])
+candidate_embeddings = model.encode([candidate_1, candidate_2])
+scores = model.similarity(query_embeddings, candidate_embeddings)
+print(scores)
+# tensor([[0.2645, 0.1251]])
+```
+The model's `encode` function accepts texts as strings; images as PIL Images, local file paths, or URLs; and combined image+text inputs as dictionaries with `"image"` and `"text"` keys. The model automatically processes the inputs and returns the corresponding embeddings, which you can then use to compute cosine similarities or perform retrieval.
+### Using transformers
 You can easily use BGE-VL-CLIP models based on ```transformers```
 ```python
 import torch

bge_vl_clip_transformer.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""Custom Transformer module for Sentence Transformers to load BGE-VL-CLIP models.
+BGE-VL-CLIP uses late fusion for multimodal inputs: text and image features are
+projected separately and summed. This module subclasses Transformer to add support
+for the ("image", "text") compound modality by summing the text and image projected
+embeddings in the forward pass.
+"""
+from __future__ import annotations
+from sentence_transformers.base.modules.transformer import Transformer
+class BGEVLCLIPTransformer(Transformer):  # Transformer subclass adding the ("image", "text") compound modality
+    @classmethod
+    def load(cls, model_name_or_path, *, trust_remote_code=False, **kwargs):  # trust_remote_code is accepted but deliberately ignored
+        # Why the flag is overridden: the custom modeling_MMRet_CLIP.py has a non-persistent
+        # position_ids buffer bug on transformers v5+, while the standard CLIPModel loads these weights fine,
+        # so the underlying model is always loaded with trust_remote_code=False, whatever the caller passed.
+        return super().load(model_name_or_path, trust_remote_code=False, **kwargs)
+    def forward(self, features, **kwargs):  # features: mapping of model inputs, optionally carrying a "modality" key
+        modality = features.get("modality", "text")  # a missing "modality" key is treated as plain text
+        if modality != ("image", "text"):
+            return super().forward(features, **kwargs)  # every other modality goes through the parent forward unchanged
+        # For ("image", "text") modality: run the parent forward twice on shallow copies of `features`
+        # (one tagged "text", one tagged "image"), then sum the two projected embeddings.
+        text_features = {**features, "modality": "text"}
+        image_features = {**features, "modality": "image"}
+        text_features = super().forward(text_features, **kwargs)
+        image_features = super().forward(image_features, **kwargs)
+        features[self.module_output_name] = (  # stored on the dict that was passed in (mutated in place)
+            text_features[self.module_output_name] + image_features[self.module_output_name]
+        )
+        return features
+    @property
+    def modalities(self):  # input kinds this module declares; the tuple is the combined image+text input
+        return ["text", "image", ("image", "text")]

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "default_prompt_name": null,
+  "model_type": "SentenceTransformer",
+  "prompts": {},
+  "similarity_fn_name": "cosine"
+}

modules.json ADDED Viewed

	@@ -0,0 +1,14 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "bge_vl_clip_transformer.BGEVLCLIPTransformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Normalize",
+    "type": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
+  }
+]

sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "transformer_task": "feature-extraction",
+    "modality_config": {
+        "text": {
+            "method": "get_text_features",
+            "method_output_name": "pooler_output"
+        },
+        "image": {
+            "method": "get_image_features",
+            "method_output_name": "pooler_output"
+        },
+        "image+text": {
+            "method": "get_text_features",
+            "method_output_name": "pooler_output"
+        }
+    },
+    "module_output_name": "sentence_embedding"
+}