Expand examples, remove trust_remote_code fully

#2
by tomaarsen HF Staff - opened
README.md CHANGED
@@ -26,9 +26,10 @@ Note: We are only using the `thinker` component of Qwen2.5 Omni and drops the `t
26
 
27
  ### Using Sentence Transformers
28
 
29
- Install Sentence Transformers:
 
30
  ```bash
31
- pip install "sentence_transformers[image]"
32
  ```
33
 
34
  ```python
@@ -37,50 +38,93 @@ from sentence_transformers import SentenceTransformer
37
 
38
  model = SentenceTransformer(
39
  "LCO-Embedding/LCO-Embedding-Omni-3B",
40
- trust_remote_code=True,
41
- model_kwargs={"dtype": torch.bfloat16},
 
 
42
  )
 
43
 
44
- # The same "Summarize the above <modality> in one word:" instruction used in
45
- # the paper is baked into the chat template, so encode() takes plain text or
46
- # multimodal dicts directly.
47
- texts = [
48
- "The capital of France is Paris.",
49
- "Paris is the capital city of France.",
50
- "The Eiffel Tower is located in Paris.",
51
- "Berlin is the capital of Germany.",
52
- ]
53
- text_embeddings = model.encode(texts)
54
- print(text_embeddings.shape)
55
- # (4, 2048)
56
-
57
- text_similarities = model.similarity(text_embeddings, text_embeddings)
58
- print(text_similarities)
59
- # tensor([[1.0000, 0.9538, 0.6566, 0.5988],
60
- # [0.9538, 1.0000, 0.7059, 0.5932],
61
- # [0.6566, 0.7059, 1.0000, 0.4198],
62
- # [0.5988, 0.5932, 0.4198, 1.0000]])
63
-
64
- # Encoding images (text, audio, and video also work, individually or combined using a dict input):
65
- image_embeddings = model.encode([
66
- "path/to/image_1.png",
67
- "path/to/image_2.png",
68
- ])
69
- print(image_embeddings.shape)
70
- # (2, 2048)
71
-
72
- # Multimodal inputs can mix modalities via dicts (text + image + audio + video):
73
- queries = ["A diagram of the Qwen2.5-Omni architecture"]
74
  documents = [
75
- {"image": "path/to/qwen_diagram.png"},
76
- {"text": "Llama 4 architecture overview", "image": "path/to/llama_diagram.png"},
 
77
  ]
78
- query_embeddings = model.encode(queries)
 
79
  document_embeddings = model.encode(documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- similarities = model.similarity(query_embeddings, document_embeddings)
82
- print(similarities.shape)
83
- # torch.Size([1, 2])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  ```
85
 
86
  ### Using Transformers
 
26
 
27
  ### Using Sentence Transformers
28
 
29
+ Install Sentence Transformers with the multimodal extras (for image, audio, and video support):
30
+
31
  ```bash
32
+ pip install "sentence_transformers[image,audio,video]" "transformers>=5.6.0"
33
  ```
34
 
35
  ```python
 
38
 
39
  model = SentenceTransformer(
40
  "LCO-Embedding/LCO-Embedding-Omni-3B",
41
+ model_kwargs={
42
+ "torch_dtype": torch.bfloat16,
43
+ "attn_implementation": "flash_attention_2", # pip install kernels; recommended but not mandatory
44
+ },
45
  )
46
+ ```
47
 
48
+ The same "Summarize the above <modality> in one word:" instruction used in the paper is baked into the chat template, so `encode()` takes plain text, file paths, URLs, or multimodal dicts directly.
49
+
50
+ #### Text Retrieval
51
+ ```python
52
+ query = "What is the tallest mountain in the world?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  documents = [
54
+ "Mount Everest is Earth's highest mountain above sea level, located in the Mahalangur Himal sub-range of the Himalayas. Its elevation of 8,848.86 metres was established by a joint Chinese-Nepali survey in 2020.",
55
+ "K2, at 8,611 metres above sea level, is the second-highest mountain on Earth, after Mount Everest. It lies in the Karakoram range on the China-Pakistan border.",
56
+ "Mount Kilimanjaro is a dormant volcano in Tanzania. It is the highest mountain in Africa, with its summit about 5,895 metres above sea level.",
57
  ]
58
+
59
+ query_embedding = model.encode(query)
60
  document_embeddings = model.encode(documents)
61
+ print(model.similarity(query_embedding, document_embeddings))
62
+ # tensor([[0.6199, 0.5585, 0.5233]])
63
+ ```
64
+
65
+ #### Image Retrieval
66
+ ```python
67
+ query = "How many input modalities does Qwen2.5-Omni support?"
68
+ documents = [
69
+ "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/qwen2.5omni_hgf.png",
70
+ "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/llama4_hgf.png",
71
+ ]
72
+
73
+ query_embedding = model.encode(query)
74
+ document_embeddings = model.encode(documents, batch_size=1)
75
+ print(model.similarity(query_embedding, document_embeddings))
76
+ # tensor([[0.4396, 0.3418]])
77
+ ```
78
 
79
+ #### Audio Retrieval
80
+ ```python
81
+ query = "A light piano piece"
82
+ documents = [
83
+ "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/joe_hisaishi_summer.mp3",
84
+ "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/jay_chou_superman_cant_fly.mp3",
85
+ ]
86
+
87
+ query_embedding = model.encode(query)
88
+ document_embeddings = model.encode(documents, batch_size=1)
89
+ print(model.similarity(query_embedding, document_embeddings))
90
+ # tensor([[0.3809, 0.0858]])
91
+ ```
92
+
93
+ #### Video Retrieval
94
+ ```python
95
+ # For video on smaller GPUs, cap the video processor's per-frame pixel budget and sample frames at 1 fps up front:
96
+ model[0].processing_kwargs.update({
97
+ "video": {"max_pixels": 64 * 28 * 28, "do_sample_frames": True, "fps": 1},
98
+ })
99
+
100
+ query = "How to cook Mapo Tofu?"
101
+ documents = [
102
+ "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/mapo_tofu.mp4",
103
+ "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/zhajiang_noodle.mp4",
104
+ ]
105
+
106
+ query_embedding = model.encode(query)
107
+ document_embeddings = model.encode(documents, batch_size=1)
108
+ print(model.similarity(query_embedding, document_embeddings))
109
+ # tensor([[0.6406, 0.5033]])
110
+ ```
111
+
112
+ #### Multimodal Inputs
113
+
114
+ To embed a document that combines multiple modalities, pass a dict with any combination of `"text"`, `"image"`, `"audio"`, and `"video"` keys instead of a single path or string:
115
+
116
+ ```python
117
+ documents = [
118
+ {
119
+ "text": "A cooking tutorial for Mapo Tofu",
120
+ "video": "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/mapo_tofu.mp4",
121
+ },
122
+ {
123
+ "image": "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/qwen2.5omni_hgf.png",
124
+ "audio": "https://huggingface.co/Tevatron/OmniEmbed-v0.1/resolve/main/assets/joe_hisaishi_summer.mp3",
125
+ },
126
+ ]
127
+ document_embeddings = model.encode(documents, batch_size=1)
128
  ```
129
 
130
  ### Using Transformers
config.json CHANGED
@@ -3,10 +3,6 @@
3
  "architectures": [
4
  "Qwen2_5OmniThinkerForConditionalGeneration"
5
  ],
6
- "auto_map": {
7
- "AutoConfig": "modeling_lco_omni.Qwen2_5OmniThinkerConfig",
8
- "AutoModel": "modeling_lco_omni.Qwen2_5OmniThinkerForConditionalGeneration"
9
- },
10
  "audio_config": {
11
  "_attn_implementation_autoset": true,
12
  "activation_dropout": 0.0,
 
3
  "architectures": [
4
  "Qwen2_5OmniThinkerForConditionalGeneration"
5
  ],
 
 
 
 
6
  "audio_config": {
7
  "_attn_implementation_autoset": true,
8
  "activation_dropout": 0.0,
config_sentence_transformers.json CHANGED
@@ -2,7 +2,7 @@
2
  "__version__": {
3
  "pytorch": "2.10.0+cu128",
4
  "sentence_transformers": "5.4.0",
5
- "transformers": "5.5.0.dev0"
6
  },
7
  "default_prompt_name": "default",
8
  "model_type": "SentenceTransformer",
 
2
  "__version__": {
3
  "pytorch": "2.10.0+cu128",
4
  "sentence_transformers": "5.4.0",
5
+ "transformers": "5.6.0"
6
  },
7
  "default_prompt_name": "default",
8
  "model_type": "SentenceTransformer",
modeling_lco_omni.py DELETED
@@ -1,8 +0,0 @@
1
- # Re-exported so `auto_map` in config.json can resolve the Thinker classes;
2
- # `qwen2_5_omni_thinker` is shipped by transformers but not in `AutoConfig`.
3
- from transformers import Qwen2_5OmniThinkerConfig, Qwen2_5OmniThinkerForConditionalGeneration
4
-
5
- __all__ = [
6
- "Qwen2_5OmniThinkerConfig",
7
- "Qwen2_5OmniThinkerForConditionalGeneration",
8
- ]
 
 
 
 
 
 
 
 
 
sentence_bert_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "transformer_task": "feature-extraction",
3
  "modality_config": {
4
  "text": {
5
  "method": "forward",
 
1
  {
2
+ "transformer_task": "any-to-any",
3
  "modality_config": {
4
  "text": {
5
  "method": "forward",