Expand examples, remove trust_remote_code fully

by tomaarsen HF Staff - opened Apr 23

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+86

-54

Files changed (5) hide show

README.md +84 -40
config.json +0 -4
config_sentence_transformers.json +1 -1
modeling_lco_omni.py +0 -8
sentence_bert_config.json +1 -1

README.md CHANGED Viewed

@@ -26,9 +26,10 @@ Note: We are only using the `thinker` component of Qwen2.5 Omni and drops the `t
 ### Using Sentence Transformers
-Install Sentence Transformers:
 ```bash
-pip install "sentence_transformers[image]"
 ```
 ```python
@@ -37,50 +38,93 @@ from sentence_transformers import SentenceTransformer
 model = SentenceTransformer(
     "LCO-Embedding/LCO-Embedding-Omni-3B",
-    trust_remote_code=True,
-    model_kwargs={"dtype": torch.bfloat16},
 )
-# The same "Summarize the above <modality> in one word:" instruction used in
-# the paper is baked into the chat template, so encode() takes plain text or
-# multimodal dicts directly.
-texts = [
-    "The capital of France is Paris.",
-    "Paris is the capital city of France.",
-    "The Eiffel Tower is located in Paris.",
-    "Berlin is the capital of Germany.",
-]
-text_embeddings = model.encode(texts)
-print(text_embeddings.shape)
-# (4, 2048)
-text_similarities = model.similarity(text_embeddings, text_embeddings)
-print(text_similarities)
-# tensor([[1.0000, 0.9538, 0.6566, 0.5988],
-#         [0.9538, 1.0000, 0.7059, 0.5932],
-#         [0.6566, 0.7059, 1.0000, 0.4198],
-#         [0.5988, 0.5932, 0.4198, 1.0000]])
-# Encoding images (text, audio, and video also work, individually or combined using a dict input):
-image_embeddings = model.encode([
-    "path/to/image_1.png",
-    "path/to/image_2.png",
-])
-print(image_embeddings.shape)
-# (2, 2048)
-# Multimodal inputs can mix modalities via dicts (text + image + audio + video):
-queries = ["A diagram of the Qwen2.5-Omni architecture"]
 documents = [
-    {"image": "path/to/qwen_diagram.png"},
-    {"text": "Llama 4 architecture overview", "image": "path/to/llama_diagram.png"},
 ]
-query_embeddings = model.encode(queries)
 document_embeddings = model.encode(documents)
-similarities = model.similarity(query_embeddings, document_embeddings)
-print(similarities.shape)
-# torch.Size([1, 2])
 ```
 ### Using Transformers

 ### Using Sentence Transformers
+Install Sentence Transformers with the multimodal extras (for image, audio, and video support):
 ```bash
+pip install "sentence_transformers[image,audio,video]" "transformers>=5.6.0"
 ```
 ```python
 model = SentenceTransformer(
     "LCO-Embedding/LCO-Embedding-Omni-3B",
+    model_kwargs={
+        "dtype": torch.bfloat16,
+        "attn_implementation": "flash_attention_2",  # pip install kernels; recommended but not mandatory
+    },
 )
+```
+The same `"Summarize the above <modality> in one word:"` instruction used in the paper is baked into the chat template, so `encode()` takes plain text, file paths, URLs, or multimodal dicts directly.
+#### Text Retrieval
+```python
+query = "What is the tallest mountain in the world?"
 documents = [
+    "Mount Everest is Earth's highest mountain above sea level, located in the Mahalangur Himal sub-range of the Himalayas. Its elevation of 8,848.86 metres was established by a joint Chinese-Nepali survey in 2020.",
+    "K2, at 8,611 metres above sea level, is the second-highest mountain on Earth, after Mount Everest. It lies in the Karakoram range on the China-Pakistan border.",
+    "Mount Kilimanjaro is a dormant volcano in Tanzania. It is the highest mountain in Africa, with its summit about 5,895 metres above sea level.",
 ]
+query_embedding = model.encode(query)
 document_embeddings = model.encode(documents)
+print(model.similarity(query_embedding, document_embeddings))
+# tensor([[0.6199, 0.5585, 0.5233]])
+```
+#### Image Retrieval
+```python
+query = "How many input modalities does Qwen2.5-Omni support?"
+documents = [
+    "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/qwen2.5omni_hgf.png",
+    "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/llama4_hgf.png",
+]
+query_embedding = model.encode(query)
+document_embeddings = model.encode(documents, batch_size=1)
+print(model.similarity(query_embedding, document_embeddings))
+# tensor([[0.4396, 0.3418]])
+```
+#### Audio Retrieval
+```python
+query = "A light piano piece"
+documents = [
+    "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/joe_hisaishi_summer.mp3",
+    "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/jay_chou_superman_cant_fly.mp3",
+]
+query_embedding = model.encode(query)
+document_embeddings = model.encode(documents, batch_size=1)
+print(model.similarity(query_embedding, document_embeddings))
+# tensor([[0.3809, 0.0858]])
+```
+#### Video Retrieval
+```python
+# For video on smaller GPUs, cap the per-frame resolution and frame sampling rate on the processor before encoding:
+model[0].processing_kwargs.update({
+    "video": {"max_pixels": 64 * 28 * 28, "do_sample_frames": True, "fps": 1},
+})
+query = "How to cook Mapo Tofu?"
+documents = [
+    "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/mapo_tofu.mp4",
+    "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/zhajiang_noodle.mp4",
+]
+query_embedding = model.encode(query)
+document_embeddings = model.encode(documents, batch_size=1)
+print(model.similarity(query_embedding, document_embeddings))
+# tensor([[0.6406, 0.5033]])
+```
+#### Multimodal Inputs
+To embed a document that combines multiple modalities, pass a dict with any combination of `"text"`, `"image"`, `"audio"`, and `"video"` keys instead of a single path or string:
+```python
+documents = [
+    {
+        "text": "A cooking tutorial for Mapo Tofu",
+        "video": "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/mapo_tofu.mp4",
+    },
+    {
+        "image": "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/qwen2.5omni_hgf.png",
+        "audio": "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/joe_hisaishi_summer.mp3",
+    },
+]
+document_embeddings = model.encode(documents, batch_size=1)
 ```
 ### Using Transformers

config.json CHANGED Viewed

@@ -3,10 +3,6 @@
   "architectures": [
     "Qwen2_5OmniThinkerForConditionalGeneration"
   ],
-  "auto_map": {
-    "AutoConfig": "modeling_lco_omni.Qwen2_5OmniThinkerConfig",
-    "AutoModel": "modeling_lco_omni.Qwen2_5OmniThinkerForConditionalGeneration"
-  },
   "audio_config": {
     "_attn_implementation_autoset": true,
     "activation_dropout": 0.0,

   "architectures": [
     "Qwen2_5OmniThinkerForConditionalGeneration"
   ],
   "audio_config": {
     "_attn_implementation_autoset": true,
     "activation_dropout": 0.0,

config_sentence_transformers.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "__version__": {
     "pytorch": "2.10.0+cu128",
     "sentence_transformers": "5.4.0",
-    "transformers": "5.5.0.dev0"
   },
   "default_prompt_name": "default",
   "model_type": "SentenceTransformer",

   "__version__": {
     "pytorch": "2.10.0+cu128",
     "sentence_transformers": "5.4.0",
+    "transformers": "5.6.0"
   },
   "default_prompt_name": "default",
   "model_type": "SentenceTransformer",

modeling_lco_omni.py DELETED Viewed

@@ -1,8 +0,0 @@
-# Re-exported so `auto_map` in config.json can resolve the Thinker classes;
-# `qwen2_5_omni_thinker` is shipped by transformers but not in `AutoConfig`.
-from transformers import Qwen2_5OmniThinkerConfig, Qwen2_5OmniThinkerForConditionalGeneration
-__all__ = [
-    "Qwen2_5OmniThinkerConfig",
-    "Qwen2_5OmniThinkerForConditionalGeneration",
-]

sentence_bert_config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-    "transformer_task": "feature-extraction",
     "modality_config": {
         "text": {
             "method": "forward",

 {
+    "transformer_task": "any-to-any",
     "modality_config": {
         "text": {
             "method": "forward",