Integrate with Transformers v5 and Sentence Transformers v5.4

by tomaarsen HF Staff - opened Apr 13

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+178

-19

Files changed (8) hide show

1_Pooling/config.json +5 -0
README.md +65 -3
chat_template.jinja +11 -16
config.json +4 -0
config_sentence_transformers.json +13 -0
modeling_eager_embed.py +30 -0
modules.json +20 -0
sentence_bert_config.json +30 -0

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "embedding_dimension": 2560,
+    "pooling_mode": "lasttoken",
+    "include_prompt": true
+}

README.md CHANGED Viewed

@@ -4,6 +4,8 @@ license: apache-2.0
 base_model:
 - Qwen/Qwen3-VL-4B-Instruct
 pipeline_tag: visual-document-retrieval
 ---
 # Eager Embed V1
@@ -28,10 +30,67 @@ Compared to multi-vector (ColBERT-like) architectures, eager-embed-v1 offers a s
 ## How to Get Started with the Model
 Load the model and define a helper function to encode messages:
 ```python
 import torch
-from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
 from transformers.utils.import_utils import is_flash_attn_2_available
 from qwen_vl_utils import process_vision_info
@@ -44,12 +103,13 @@ elif torch.backends.mps.is_available():
 DTYPE = torch.bfloat16
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-model = Qwen3VLForConditionalGeneration.from_pretrained(
     MODEL_NAME,
     attn_implementation=(
         "flash_attention_2" if is_flash_attn_2_available() else None
     ),
-    dtype=DTYPE
 ).to(DEVICE).eval()
 # Function to Encode Message
@@ -87,6 +147,7 @@ sim1 = torch.cosine_similarity(encode_message(query), encode_message(text_1))
 sim2 = torch.cosine_similarity(encode_message(query), encode_message(text_2))
 print("Similarities:", sim1.item(), sim2.item())
 ```
 📈 Image Document Retrieval (Image, Chart, PDF)
@@ -103,6 +164,7 @@ sim1 = torch.cosine_similarity(encode_message(query), encode_message(image_1))
 sim2 = torch.cosine_similarity(encode_message(query), encode_message(image_2))
 print("Similarities:", sim1.item(), sim2.item())
 ```
 ## Training Details

 base_model:
 - Qwen/Qwen3-VL-4B-Instruct
 pipeline_tag: visual-document-retrieval
+tags:
+- sentence-transformers
 ---
 # Eager Embed V1
 ## How to Get Started with the Model
+### Using Sentence Transformers
+Install Sentence Transformers v5.4 or newer (the version this integration targets):
+```bash
+pip install -U "sentence-transformers>=5.4"
+```
+```python
+import requests
+from io import BytesIO
+from PIL import Image
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer("eagerworks/eager-embed-v1", trust_remote_code=True)
+# Multilingual text retrieval
+# `encode_query` automatically prepends the "Query: " prefix the model was trained on.
+queries = ["What is the capital city of Uruguay?"]
+documents = [
+    "Montevideo es la capital y la ciudad más poblada de la República Oriental del Uruguay, así como la capital del departamento homónimo",
+    "El río Uruguay es un río internacional que forma parte de la cuenca del Plata. Nace en Brasil, recorre unos 1.800 km y desemboca en el Río de la Plata",
+]
+query_embeddings = model.encode_query(queries)
+document_embeddings = model.encode_document(documents)
+print(query_embeddings.shape, document_embeddings.shape)
+# (1, 2560) (2, 2560)
+similarities = model.similarity(query_embeddings, document_embeddings)
+print(similarities)
+# tensor([[0.2907, 0.1573]])
+# Image document retrieval
+MAX_IMAGE_SIZE = 784
+def fetch_image(url):
+    img = Image.open(BytesIO(requests.get(url).content)).convert("RGB")
+    return img.resize((MAX_IMAGE_SIZE, MAX_IMAGE_SIZE))
+queries = ["Where can we find the animal llama?"]
+documents = [
+    fetch_image("https://huggingface.co/Tevatron/dse-phi3-docmatix-v2/resolve/main/animal-llama.png"),
+    fetch_image("https://huggingface.co/Tevatron/dse-phi3-docmatix-v2/resolve/main/meta-llama.png"),
+]
+query_embeddings = model.encode_query(queries)
+document_embeddings = model.encode_document(documents)
+print(query_embeddings.shape, document_embeddings.shape)
+# (1, 2560) (2, 2560)
+similarities = model.similarity(query_embeddings, document_embeddings)
+print(similarities)
+# tensor([[0.2709, 0.0930]])
+```
+### Using Transformers
 Load the model and define a helper function to encode messages:
 ```python
 import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText
 from transformers.utils.import_utils import is_flash_attn_2_available
 from qwen_vl_utils import process_vision_info
 DTYPE = torch.bfloat16
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = AutoModelForImageTextToText.from_pretrained(
     MODEL_NAME,
     attn_implementation=(
         "flash_attention_2" if is_flash_attn_2_available() else None
     ),
+    dtype=DTYPE,
+    trust_remote_code=True,
 ).to(DEVICE).eval()
 # Function to Encode Message
 sim2 = torch.cosine_similarity(encode_message(query), encode_message(text_2))
 print("Similarities:", sim1.item(), sim2.item())
+# Similarities: 0.2907 0.1573
 ```
 📈 Image Document Retrieval (Image, Chart, PDF)
 sim2 = torch.cosine_similarity(encode_message(query), encode_message(image_2))
 print("Similarities:", sim1.item(), sim2.item())
+# Similarities: 0.2709 0.0929
 ```
 ## Training Details

chat_template.jinja CHANGED Viewed

@@ -18,26 +18,18 @@
         {{- tool | tojson }}
     {%- endfor %}
     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
-{%- else %}
-    {%- if messages[0].role == 'system' %}
-        {{- '<|im_start|>system\n' }}
-        {%- if messages[0].content is string %}
-            {{- messages[0].content }}
-        {%- else %}
-            {%- for content in messages[0].content %}
-                {%- if 'text' in content %}
-                    {{- content.text }}
-                {%- endif %}
-            {%- endfor %}
-        {%- endif %}
-        {{- '<|im_end|>\n' }}
-    {%- endif %}
 {%- endif %}
 {%- set image_count = namespace(value=0) %}
 {%- set video_count = namespace(value=0) %}
 {%- for message in messages %}
-    {%- if message.role == "user" %}
-        {{- '<|im_start|>' + message.role + '\n' }}
         {%- if message.content is string %}
             {{- message.content }}
         {%- else %}
@@ -118,3 +110,6 @@
 {%- if add_generation_prompt %}
     {{- '<|im_start|>assistant\n' }}
 {%- endif %}

         {{- tool | tojson }}
     {%- endfor %}
     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- endif %}
+{#- sys_prefix holds the system-message text that is emitted at the start of each user turn (empty when tools are given or there is no leading system message). #}
+{%- set sys_prefix = '' %}
+{%- if not tools and messages[0].role == 'system' %}
+    {#- Accept both a plain-string system message and the structured list-of-parts form. #}
+    {%- if messages[0].content is string %}
+        {%- set sys_prefix = messages[0].content %}
+    {%- else %}
+        {%- set sys_prefix = messages[0].content[0].text %}
+    {%- endif %}
 {%- endif %}
 {%- set image_count = namespace(value=0) %}
 {%- set video_count = namespace(value=0) %}
 {%- for message in messages %}
+    {%- if message.role == "system" and not tools %}
+        {# system text is inlined into the user message below #}
+    {%- elif message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' + sys_prefix }}
         {%- if message.content is string %}
             {{- message.content }}
         {%- else %}
 {%- if add_generation_prompt %}
     {{- '<|im_start|>assistant\n' }}
 {%- endif %}
+{%- if add_embedding_token %}
+    {{- '<|endoftext|>' }}
+{%- endif %}

config.json CHANGED Viewed

@@ -2,6 +2,10 @@
   "architectures": [
     "Qwen3VLForConditionalGeneration"
   ],
   "dtype": "float32",
   "image_token_id": 151655,
   "model_type": "qwen3_vl",

   "architectures": [
     "Qwen3VLForConditionalGeneration"
   ],
+  "auto_map": {
+    "AutoModel": "modeling_eager_embed.EagerEmbedModel",
+    "AutoModelForImageTextToText": "modeling_eager_embed.EagerEmbedForConditionalGeneration"
+  },
   "dtype": "float32",
   "image_token_id": 151655,
   "model_type": "qwen3_vl",

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "__version__": {
+    "pytorch": "2.10.0+cu128",
+    "sentence_transformers": "5.4.0",
+    "transformers": "5.5.0"
+  },
+  "model_type": "SentenceTransformer",
+  "prompts": {
+    "query": "Query: ",
+    "document": ""
+  },
+  "similarity_fn_name": "cosine"
+}

modeling_eager_embed.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import torch.nn as nn
+from transformers.models.qwen3_vl.modeling_qwen3_vl import (
+    Qwen3VLForConditionalGeneration,
+    Qwen3VLModel,
+)
+# The model was trained with transformers==4.57.1, where
+# `Qwen3VLForConditionalGeneration(...).hidden_states[-1]` was the pre-final-norm
+# state of the text decoder. In transformers 5.x that field is now the post-norm
+# `last_hidden_state`. Replacing the text model's final RMSNorm with a no-op
+# restores the representation the model was trained on.
+_NORM_KEY_PATTERN = r"^model\.language_model\.norm\.weight$"
+class EagerEmbedModel(Qwen3VLModel):  # Qwen3VLModel whose text-decoder final norm is replaced by a no-op
+    _keys_to_ignore_on_load_unexpected = [_NORM_KEY_PATTERN]  # the norm weight has no parameter to load into once the norm is Identity; presumably this keeps the loader from reporting it (confirm against the transformers loader)
+    def __init__(self, config):
+        super().__init__(config)  # build the stock model, then override its final norm below
+        self.language_model.norm = nn.Identity()  # pass-through, so the output is the pre-norm hidden state
+class EagerEmbedForConditionalGeneration(Qwen3VLForConditionalGeneration):  # Qwen3VLForConditionalGeneration whose text-decoder final norm is replaced by a no-op
+    _keys_to_ignore_on_load_unexpected = [_NORM_KEY_PATTERN]  # the norm weight has no parameter to load into once the norm is Identity; presumably this keeps the loader from reporting it (confirm against the transformers loader)
+    def __init__(self, config):
+        super().__init__(config)  # build the stock model, then override its final norm below
+        self.model.language_model.norm = nn.Identity()  # in this class the text decoder is reached through `self.model`

modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.base.modules.transformer.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.sentence_transformer.modules.pooling.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
+  }
+]

sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+    "transformer_task": "feature-extraction",
+    "modality_config": {
+        "text": {
+            "method": "forward",
+            "method_output_name": "last_hidden_state"
+        },
+        "image": {
+            "method": "forward",
+            "method_output_name": "last_hidden_state"
+        },
+        "video": {
+            "method": "forward",
+            "method_output_name": "last_hidden_state"
+        },
+        "message": {
+            "method": "forward",
+            "method_output_name": "last_hidden_state",
+            "format": "structured"
+        }
+    },
+    "module_output_name": "token_embeddings",
+    "processing_kwargs": {
+        "chat_template": {
+            "add_generation_prompt": true,
+            "add_embedding_token": true
+        }
+    },
+    "unpad_inputs": false
+}