Integrate with Sentence Transformers v5.4

#7
by tomaarsen HF Staff - opened
README.md CHANGED
@@ -4,9 +4,12 @@ language:
4
  - en
5
  base_model:
6
  - openai/clip-vit-base-patch16
 
7
  tags:
 
8
  - multimodal-retrieval
9
  - embedding-model
 
10
  ---
11
 
12
  <h1 align="center">MegaPairs: Massive Data Synthesis For Universal Multimodal Retrieval</h1>
@@ -61,7 +64,52 @@ BGE-VL achieve state-of-the-art performance on four popular zero-shot composed i
61
 
62
  ## Model Usage
63
 
64
- ### 1. BGE-VL-CLIP Models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  You can easily use BGE-VL-CLIP models based on ```transformers```
66
  ```python
67
  import torch
 
4
  - en
5
  base_model:
6
  - openai/clip-vit-base-patch16
7
+ library_name: sentence-transformers
8
  tags:
9
+ - sentence-transformers
10
  - multimodal-retrieval
11
  - embedding-model
12
+ pipeline_tag: sentence-similarity
13
  ---
14
 
15
  <h1 align="center">MegaPairs: Massive Data Synthesis For Universal Multimodal Retrieval</h1>
 
64
 
65
  ## Model Usage
66
 
67
+ ### Using Sentence Transformers
68
+
69
+ Install Sentence Transformers:
70
+ ```bash
71
+ pip install "sentence_transformers[image]"
72
+ ```
73
+
74
+ ```python
75
+ from sentence_transformers import SentenceTransformer
76
+
77
+ model = SentenceTransformer("BAAI/BGE-VL-base", trust_remote_code=True)
78
+
79
+ query_image = "https://huggingface.co/BAAI/BGE-VL-base/resolve/main/assets/cir_query.png"
80
+ candidate_1 = "https://huggingface.co/BAAI/BGE-VL-base/resolve/main/assets/cir_candi_1.png"
81
+ candidate_2 = "https://huggingface.co/BAAI/BGE-VL-base/resolve/main/assets/cir_candi_2.png"
82
+
83
+ # Encode text
84
+ text_embeddings = model.encode(["A dog sitting on a bench", "A cat sleeping on a couch"])
85
+ print(text_embeddings.shape)
86
+ # (2, 512)
87
+
88
+ # Encode images
89
+ image_embeddings = model.encode([query_image, candidate_1])
90
+ print(image_embeddings.shape)
91
+ # (2, 512)
92
+
93
+ # Compute similarities
94
+ similarities = model.similarity(text_embeddings, image_embeddings)
95
+ print(similarities)
96
+ # tensor([[0.1050, 0.0871],
97
+ # [0.0010, 0.0355]])
98
+
99
+ # Composed image retrieval: encode image+text query, compare with image candidates
100
+ query_embeddings = model.encode([{
101
+ "image": query_image,
102
+ "text": "Make the background dark, as if the camera has taken the photo at night",
103
+ }])
104
+ candidate_embeddings = model.encode([candidate_1, candidate_2])
105
+ scores = model.similarity(query_embeddings, candidate_embeddings)
106
+ print(scores)
107
+ # tensor([[0.2645, 0.1251]])
108
+ ```
109
+
110
+ You can pass several kinds of input to the model's `encode` function: texts as strings; images as PIL Images, local file paths, or URLs; or combined image-and-text inputs as dictionaries with `image` and `text` keys. The model automatically processes the inputs and returns the corresponding embeddings, which you can then use to compute cosine similarities or perform retrieval.
111
+
112
+ ### Using transformers
113
  You can easily use BGE-VL-CLIP models with `transformers`:
114
  ```python
115
  import torch
bge_vl_clip_transformer.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Custom Transformer module for Sentence Transformers to load BGE-VL-CLIP models.
2
+
3
+ BGE-VL-CLIP uses late fusion for multimodal inputs: text and image features are
4
+ projected separately and summed. This module subclasses Transformer to add support
5
+ for the ("image", "text") compound modality by summing the text and image projected
6
+ embeddings in the forward pass.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from sentence_transformers.base.modules.transformer import Transformer
12
+
13
+
14
class BGEVLCLIPTransformer(Transformer):
    """Sentence Transformers ``Transformer`` module for BGE-VL-CLIP checkpoints.

    Extends the parent module with the compound ``("image", "text")`` modality:
    such inputs are sent through the parent's text path and image path
    separately, and the two resulting embeddings are added together.
    """

    @classmethod
    def load(cls, model_name_or_path, *, trust_remote_code=False, **kwargs):
        """Load the module, never enabling remote code for the underlying model.

        ``trust_remote_code`` is accepted so the signature stays compatible with
        callers, but its value is ignored: ``False`` is always forwarded to the
        parent loader. All other keyword arguments are passed through unchanged.
        """
        # Why remote code is forced off: the custom modeling_MMRet_CLIP.py hits a
        # non-persistent position_ids buffer bug on transformers v5+, whereas the
        # standard CLIPModel loads these weights fine.
        return super().load(model_name_or_path, trust_remote_code=False, **kwargs)

    def forward(self, features, **kwargs):
        """Run the forward pass, fusing text and image outputs for compound inputs.

        When ``features["modality"]`` (defaulting to ``"text"`` if absent) is
        anything other than ``("image", "text")``, the call is delegated to the
        parent unchanged. Otherwise the parent forward is run once as ``"text"``
        and once as ``"image"``, and the sum of both outputs is stored in
        ``features`` under ``self.module_output_name`` before ``features`` is
        returned.
        """
        is_compound = features.get("modality", "text") == ("image", "text")
        if not is_compound:
            return super().forward(features, **kwargs)

        # Build both single-modality views up front; each is a shallow copy of
        # the incoming features with only the "modality" entry overridden.
        text_inputs = dict(features, modality="text")
        image_inputs = dict(features, modality="image")

        text_out = super().forward(text_inputs, **kwargs)
        image_out = super().forward(image_inputs, **kwargs)

        # Late fusion: add the text embedding and the image embedding.
        fused = text_out[self.module_output_name] + image_out[self.module_output_name]
        features[self.module_output_name] = fused
        return features

    @property
    def modalities(self):
        """Return the supported modalities, including the compound image+text pair."""
        supported = ["text", "image"]
        supported.append(("image", "text"))
        return supported
config_sentence_transformers.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "default_prompt_name": null,
3
+ "model_type": "SentenceTransformer",
4
+ "prompts": {},
5
+ "similarity_fn_name": "cosine"
6
+ }
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "bge_vl_clip_transformer.BGEVLCLIPTransformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Normalize",
12
+ "type": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
13
+ }
14
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "feature-extraction",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "get_text_features",
6
+ "method_output_name": "pooler_output"
7
+ },
8
+ "image": {
9
+ "method": "get_image_features",
10
+ "method_output_name": "pooler_output"
11
+ },
12
+ "image+text": {
13
+ "method": "get_text_features",
14
+ "method_output_name": "pooler_output"
15
+ }
16
+ },
17
+ "module_output_name": "sentence_embedding"
18
+ }