gijs committed on
Commit
f6fc423
·
verified ·
1 Parent(s): 5ec805e

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ language:
4
+ - en
5
+ library_name: sentence-transformers
6
+ pipeline_tag: feature-extraction
7
+ base_model: LCO-Embedding/LCO-Embedding-Omni-7B
8
+ tags:
9
+ - audio
10
+ - speech
11
+ - emotion
12
+ - clap
13
+ - contrastive
14
+ - voice
15
+ - sentence-transformers
16
+ ---
17
+
18
+ # VoiceCLAP-Large
19
+
20
+ Voice-text contrastive embedding model — the larger of the two anchors
21
+ released with [VoiceNet](https://huggingface.co/VoiceNet).
22
+
23
+ VoiceCLAP-Large is a **single-tower** model: a rank-16 LoRA finetune of
24
+ [LCO-Embedding-Omni-7B](https://huggingface.co/LCO-Embedding/LCO-Embedding-Omni-7B)
25
+ (Qwen2.5-Omni-Thinker-7B backbone with a sentence-transformer
26
+ last-token-pooling head) trained with the symmetric InfoNCE loss. The audio
27
+ and text embeddings are produced by the same backbone — the modality is
28
+ determined by what is fed in via the multimodal chat template.
29
+
30
+ | | |
31
+ | --- | --- |
32
+ | Architecture | single-tower Omni-Embedding (Qwen2.5-Omni-Thinker-7B + ST last-token-pool) |
33
+ | Adaptation | rank-16 LoRA (alpha 32, dropout 0.05), merged into the released weights |
34
+ | Joint embedding | 3 584-d, L2-normalised |
35
+ | Loss | symmetric InfoNCE (all-gather negatives) |
36
+ | Total parameters | ~7 B (full merged model) |
37
+ | Epochs | 1 |
38
+
39
+ ## Training data
40
+
41
+ Trained for **1 epoch** on the open `voiceclap_10` mixture used in the
42
+ VoiceNet paper:
43
+
44
+ - `emolia-balanced-5M-subset` (annotated subset of [Emilia](https://huggingface.co/datasets/amphion/Emilia-Dataset))
45
+ - `laions_got_talent_clean_with_captions`
46
+ - `majestrino-data`
47
+ - `synthetic_vocal_bursts`
48
+ - `improved_synthetic_vocal_bursts`
49
+ - `ears`
50
+
51
+ All clips are paired with `MOSS-Audio-8B-Thinking`-derived dense
52
+ vocal-style captions covering emotions, talking-style attributes, and
53
+ demographics.
54
+
55
+ ## Standalone load example
56
+
57
+ The model uses the SentenceTransformer multimodal API — both
58
+ `sentence-transformers` and `transformers` are on PyPI; no other deps are
59
+ required for text embeddings; the audio example below additionally uses `soundfile` to read the clip.
60
+
61
+ ```python
62
+ from sentence_transformers import SentenceTransformer
63
+
64
+ model = SentenceTransformer("VoiceNet/voiceclap-large", trust_remote_code=True)
65
+
66
+ # Text embedding (3 584-d, L2-normalised)
67
+ text_emb = model.encode(["a calm and steady voice"])
68
+
69
+ # Audio embedding — pass a dict with raw samples + sampling rate.
70
+ import soundfile as sf
71
+ arr, sr = sf.read("clip.wav")
72
+ audio_emb = model.encode([{"array": arr, "sampling_rate": sr}])
73
+
74
+ # Cosine similarity (embeddings already L2-normalised)
75
+ print((audio_emb @ text_emb.T).item())
76
+ ```
77
+
78
+ For convenience the LoRA adapter is also shipped under `adapter/` so it can
79
+ be reapplied to other LCO-Embedding-Omni-7B forks; the merged
80
+ `model.safetensors` already contains it.
81
+
82
+ ## Citation
83
+
84
+ If you use this model, please cite the VoiceNet paper.
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "architectures": [
4
+ "Qwen2_5OmniThinkerForConditionalGeneration"
5
+ ],
6
+ "audio_config": {
7
+ "_attn_implementation_autoset": true,
8
+ "activation_dropout": 0.0,
9
+ "activation_function": "gelu",
10
+ "attention_dropout": 0.0,
11
+ "d_model": 1280,
12
+ "dropout": 0.0,
13
+ "dtype": "bfloat16",
14
+ "encoder_attention_heads": 20,
15
+ "encoder_ffn_dim": 5120,
16
+ "encoder_layerdrop": 0.0,
17
+ "encoder_layers": 32,
18
+ "init_std": 0.02,
19
+ "initializer_range": 0.02,
20
+ "max_source_positions": 1500,
21
+ "model_type": "qwen2_5_omni_audio_encoder",
22
+ "n_window": 100,
23
+ "num_hidden_layers": 32,
24
+ "num_mel_bins": 128,
25
+ "output_dim": 3584,
26
+ "scale_embedding": false,
27
+ "tf_legacy_loss": false,
28
+ "use_bfloat16": false
29
+ },
30
+ "audio_end_token_id": 151648,
31
+ "audio_start_token_id": 151647,
32
+ "audio_token_index": 151646,
33
+ "bos_token_id": 151644,
34
+ "dtype": "bfloat16",
35
+ "eos_token_id": 151645,
36
+ "ignore_index": -100,
37
+ "image_token_index": 151655,
38
+ "init_std": 0.02,
39
+ "initializer_range": 0.02,
40
+ "model_type": "qwen2_5_omni_thinker",
41
+ "pad_token_id": 151643,
42
+ "position_id_per_seconds": 25,
43
+ "seconds_per_chunk": 2,
44
+ "text_config": {
45
+ "attention_dropout": 0.0,
46
+ "bos_token_id": null,
47
+ "dtype": "bfloat16",
48
+ "eos_token_id": null,
49
+ "hidden_act": "silu",
50
+ "hidden_size": 3584,
51
+ "init_std": 0.02,
52
+ "initializer_range": 0.02,
53
+ "intermediate_size": 18944,
54
+ "layer_types": [
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention",
71
+ "full_attention",
72
+ "full_attention",
73
+ "full_attention",
74
+ "full_attention",
75
+ "full_attention",
76
+ "full_attention",
77
+ "full_attention",
78
+ "full_attention",
79
+ "full_attention",
80
+ "full_attention",
81
+ "full_attention",
82
+ "full_attention"
83
+ ],
84
+ "max_position_embeddings": 32768,
85
+ "max_window_layers": 28,
86
+ "model_type": "qwen2_5_omni_text",
87
+ "num_attention_heads": 28,
88
+ "num_hidden_layers": 28,
89
+ "num_key_value_heads": 4,
90
+ "pad_token_id": null,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_parameters": {
93
+ "mrope_section": [
94
+ 16,
95
+ 24,
96
+ 24
97
+ ],
98
+ "rope_theta": 1000000.0,
99
+ "rope_type": "default",
100
+ "type": "default"
101
+ },
102
+ "sliding_window": null,
103
+ "use_cache": true,
104
+ "use_sliding_window": false,
105
+ "vocab_size": 152064
106
+ },
107
+ "tie_word_embeddings": false,
108
+ "transformers_version": "5.1.0",
109
+ "user_token_id": 872,
110
+ "video_token_index": 151656,
111
+ "vision_config": {
112
+ "_attn_implementation_autoset": true,
113
+ "depth": 32,
114
+ "dtype": "bfloat16",
115
+ "embed_dim": 1280,
116
+ "fullatt_block_indexes": [
117
+ 7,
118
+ 15,
119
+ 23,
120
+ 31
121
+ ],
122
+ "hidden_act": "silu",
123
+ "hidden_size": 1280,
124
+ "in_channels": 3,
125
+ "in_chans": 3,
126
+ "init_std": 0.02,
127
+ "initializer_range": 0.02,
128
+ "intermediate_size": 3420,
129
+ "model_type": "qwen2_5_omni_vision_encoder",
130
+ "num_heads": 16,
131
+ "out_hidden_size": 3584,
132
+ "patch_size": 14,
133
+ "spatial_merge_size": 2,
134
+ "spatial_patch_size": 14,
135
+ "temporal_patch_size": 2,
136
+ "tf_legacy_loss": false,
137
+ "tokens_per_second": 25,
138
+ "use_bfloat16": false,
139
+ "window_size": 112
140
+ },
141
+ "vision_end_token_id": 151653,
142
+ "vision_start_token_id": 151652,
143
+ "vision_token_id": 151654
144
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.10.0+cu128",
4
+ "sentence_transformers": "5.4.1",
5
+ "transformers": "5.1.0"
6
+ },
7
+ "default_prompt_name": "default",
8
+ "model_type": "SentenceTransformer",
9
+ "prompts": {
10
+ "default": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
11
+ "document": "",
12
+ "query": ""
13
+ },
14
+ "similarity_fn_name": "cosine"
15
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151644,
4
+ "eos_token_id": 151645,
5
+ "pad_token_id": 151643,
6
+ "transformers_version": "5.1.0"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cb1c7c101cd5c360775bde52742bc7369bb8906d1991c4ab3cbeab836d81bf0
3
+ size 18133945952
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.base.modules.transformer.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.sentence_transformer.modules.pooling.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
19
+ }
20
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 300,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 128,
6
+ "hop_length": 160,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_processor_type": "Qwen2VLImageProcessor",
13
+ "image_std": [
14
+ 0.26862954,
15
+ 0.26130258,
16
+ 0.27577711
17
+ ],
18
+ "max_pixels": 12845056,
19
+ "merge_size": 2,
20
+ "min_pixels": 3136,
21
+ "n_fft": 400,
22
+ "n_samples": 4800000,
23
+ "nb_max_frames": 30000,
24
+ "padding_side": "right",
25
+ "padding_value": 0.0,
26
+ "patch_size": 14,
27
+ "processor_class": "Qwen2_5OmniProcessor",
28
+ "return_attention_mask": true,
29
+ "sampling_rate": 16000,
30
+ "temporal_patch_size": 2
31
+ }
processor_config.json ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor": {
3
+ "chunk_length": 300,
4
+ "dither": 0.0,
5
+ "feature_extractor_type": "WhisperFeatureExtractor",
6
+ "feature_size": 128,
7
+ "hop_length": 160,
8
+ "image_mean": [
9
+ 0.48145466,
10
+ 0.4578275,
11
+ 0.40821073
12
+ ],
13
+ "image_processor_type": "Qwen2VLImageProcessor",
14
+ "image_std": [
15
+ 0.26862954,
16
+ 0.26130258,
17
+ 0.27577711
18
+ ],
19
+ "max_pixels": 12845056,
20
+ "merge_size": 2,
21
+ "min_pixels": 3136,
22
+ "n_fft": 400,
23
+ "n_samples": 4800000,
24
+ "nb_max_frames": 30000,
25
+ "padding_side": "right",
26
+ "padding_value": 0.0,
27
+ "patch_size": 14,
28
+ "return_attention_mask": true,
29
+ "sampling_rate": 16000,
30
+ "temporal_patch_size": 2
31
+ },
32
+ "image_processor": {
33
+ "chunk_length": 300,
34
+ "data_format": "channels_first",
35
+ "dither": 0.0,
36
+ "do_convert_rgb": true,
37
+ "do_normalize": true,
38
+ "do_rescale": true,
39
+ "do_resize": true,
40
+ "feature_size": 128,
41
+ "hop_length": 160,
42
+ "image_mean": [
43
+ 0.48145466,
44
+ 0.4578275,
45
+ 0.40821073
46
+ ],
47
+ "image_processor_type": "Qwen2VLImageProcessorFast",
48
+ "image_std": [
49
+ 0.26862954,
50
+ 0.26130258,
51
+ 0.27577711
52
+ ],
53
+ "merge_size": 2,
54
+ "n_fft": 400,
55
+ "n_samples": 4800000,
56
+ "nb_max_frames": 30000,
57
+ "padding_side": "right",
58
+ "padding_value": 0.0,
59
+ "patch_size": 14,
60
+ "resample": 3,
61
+ "rescale_factor": 0.00392156862745098,
62
+ "return_attention_mask": true,
63
+ "sampling_rate": 16000,
64
+ "size": {
65
+ "longest_edge": 12845056,
66
+ "shortest_edge": 3136
67
+ },
68
+ "temporal_patch_size": 2
69
+ },
70
+ "processor_class": "Qwen2_5OmniProcessor",
71
+ "video_processor": {
72
+ "chunk_length": 300,
73
+ "data_format": "channels_first",
74
+ "default_to_square": true,
75
+ "dither": 0.0,
76
+ "do_convert_rgb": true,
77
+ "do_normalize": true,
78
+ "do_rescale": true,
79
+ "do_resize": true,
80
+ "do_sample_frames": false,
81
+ "feature_extractor_type": "WhisperFeatureExtractor",
82
+ "feature_size": 128,
83
+ "hop_length": 160,
84
+ "image_mean": [
85
+ 0.48145466,
86
+ 0.4578275,
87
+ 0.40821073
88
+ ],
89
+ "image_processor_type": "Qwen2VLImageProcessor",
90
+ "image_std": [
91
+ 0.26862954,
92
+ 0.26130258,
93
+ 0.27577711
94
+ ],
95
+ "max_frames": 768,
96
+ "merge_size": 2,
97
+ "min_frames": 4,
98
+ "n_fft": 400,
99
+ "n_samples": 4800000,
100
+ "nb_max_frames": 30000,
101
+ "padding_side": "right",
102
+ "padding_value": 0.0,
103
+ "patch_size": 14,
104
+ "resample": 3,
105
+ "rescale_factor": 0.00392156862745098,
106
+ "return_attention_mask": true,
107
+ "return_metadata": false,
108
+ "sampling_rate": 16000,
109
+ "size": {
110
+ "longest_edge": 12845056,
111
+ "shortest_edge": 3136
112
+ },
113
+ "temporal_patch_size": 2,
114
+ "video_processor_type": "Qwen2VLVideoProcessor"
115
+ }
116
+ }
sentence_bert_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "any-to-any",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": [
7
+ "hidden_states",
8
+ -1
9
+ ]
10
+ },
11
+ "image": {
12
+ "method": "forward",
13
+ "method_output_name": [
14
+ "hidden_states",
15
+ -1
16
+ ]
17
+ },
18
+ "audio": {
19
+ "method": "forward",
20
+ "method_output_name": [
21
+ "hidden_states",
22
+ -1
23
+ ]
24
+ },
25
+ "video": {
26
+ "method": "forward",
27
+ "method_output_name": [
28
+ "hidden_states",
29
+ -1
30
+ ]
31
+ },
32
+ "message": {
33
+ "method": "forward",
34
+ "method_output_name": [
35
+ "hidden_states",
36
+ -1
37
+ ],
38
+ "format": "structured"
39
+ }
40
+ },
41
+ "module_output_name": "token_embeddings",
42
+ "processing_kwargs": {
43
+ "chat_template": {
44
+ "chat_template": "sentence_transformers",
45
+ "add_generation_prompt": true
46
+ }
47
+ }
48
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c75e6d39795d574e5ec741e767ca690ea08a33bfa024ef2d372b4e4c72db191
3
+ size 11422137
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "audio_bos_token": "<|audio_bos|>",
4
+ "audio_eos_token": "<|audio_eos|>",
5
+ "audio_token": "<|AUDIO|>",
6
+ "backend": "tokenizers",
7
+ "bos_token": null,
8
+ "clean_up_tokenization_spaces": false,
9
+ "eos_token": "<|im_end|>",
10
+ "errors": "replace",
11
+ "image_token": "<|IMAGE|>",
12
+ "is_local": true,
13
+ "model_max_length": 32768,
14
+ "model_specific_special_tokens": {
15
+ "audio_bos_token": "<|audio_bos|>",
16
+ "audio_eos_token": "<|audio_eos|>",
17
+ "audio_token": "<|AUDIO|>",
18
+ "image_token": "<|IMAGE|>",
19
+ "video_token": "<|VIDEO|>",
20
+ "vision_bos_token": "<|vision_bos|>",
21
+ "vision_eos_token": "<|vision_eos|>"
22
+ },
23
+ "pad_token": "<|endoftext|>",
24
+ "processor_class": "Qwen2_5OmniProcessor",
25
+ "split_special_tokens": false,
26
+ "tokenizer_class": "TokenizersBackend",
27
+ "unk_token": null,
28
+ "video_token": "<|VIDEO|>",
29
+ "vision_bos_token": "<|vision_bos|>",
30
+ "vision_eos_token": "<|vision_eos|>"
31
+ }