kbtit25 commited on
Commit
d769c06
·
verified ·
1 Parent(s): 4d5c679

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ qwen0.6bemo4-merge/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.msc ADDED
Binary file (2.71 kB). View file
 
.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:master,CreatedAt:1759148356
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ ---
6
+
7
+ ## Introduction
8
+ - Converted weight files for: [IndexTTS-2-vLLM](https://github.com/Ksuriuri/index-tts-vllm)
9
+ - Original project: [IndexTTS](https://github.com/index-tts/index-tts)
10
+ - Original model weights: [IndexTTS-2](https://modelscope.cn/models/IndexTeam/IndexTTS-2)
bigvgan/bigvgan_generator.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e95ba25972d3de0628d99cd156e9315a9c018899bf739988959ebe3544080ced
3
+ size 449228171
bigvgan/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 0,
4
+ "batch_size": 32,
5
+ "learning_rate": 0.0001,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.9999996,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [4,4,2,2,2,2],
12
+ "upsample_kernel_sizes": [8,8,4,4,4,4],
13
+ "upsample_initial_channel": 1536,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+
17
+ "use_tanh_at_final": false,
18
+ "use_bias_at_final": false,
19
+
20
+ "activation": "snakebeta",
21
+ "snake_logscale": true,
22
+
23
+ "use_cqtd_instead_of_mrd": true,
24
+ "cqtd_filters": 128,
25
+ "cqtd_max_filters": 1024,
26
+ "cqtd_filters_scale": 1,
27
+ "cqtd_dilations": [1, 2, 4],
28
+ "cqtd_hop_lengths": [512, 256, 256],
29
+ "cqtd_n_octaves": [9, 9, 9],
30
+ "cqtd_bins_per_octaves": [24, 36, 48],
31
+
32
+ "mpd_reshapes": [2, 3, 5, 7, 11],
33
+ "use_spectral_norm": false,
34
+ "discriminator_channel_mult": 1,
35
+
36
+ "use_multiscale_melloss": true,
37
+ "lambda_melloss": 15,
38
+
39
+ "clip_grad_norm": 500,
40
+
41
+ "segment_size": 65536,
42
+ "num_mels": 80,
43
+ "num_freq": 1025,
44
+ "n_fft": 1024,
45
+ "hop_size": 256,
46
+ "win_size": 1024,
47
+
48
+ "sampling_rate": 22050,
49
+
50
+ "fmin": 0,
51
+ "fmax": null,
52
+ "fmax_for_loss": null,
53
+
54
+ "normalize_volume": true,
55
+
56
+ "num_workers": 4,
57
+
58
+ "dist_config": {
59
+ "dist_backend": "nccl",
60
+ "dist_url": "tcp://localhost:54321",
61
+ "world_size": 1
62
+ }
63
+ }
bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2a5ce8090d32da3642cc4f81fdc996376bc6dd3f4cd5e3d165f71120d9f2bc8
3
+ size 475997
campplus/campplus_cn_common.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3388cf5fd3493c9ac9c69851d8e7a8badcfb4f3dc631020c4961371646d5ada8
3
+ size 28036335
config.yaml ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ bpe_model: bpe.model
3
+ sample_rate: 24000
4
+ squeeze: false
5
+ mel:
6
+ sample_rate: 24000
7
+ n_fft: 1024
8
+ hop_length: 256
9
+ win_length: 1024
10
+ n_mels: 100
11
+ mel_fmin: 0
12
+ normalize: false
13
+
14
+ gpt:
15
+ model_dim: 1280
16
+ max_mel_tokens: 1815
17
+ max_text_tokens: 600
18
+ heads: 20
19
+ use_mel_codes_as_input: true
20
+ mel_length_compression: 1024
21
+ layers: 24
22
+ number_text_tokens: 12000
23
+ number_mel_codes: 8194
24
+ start_mel_token: 8192
25
+ stop_mel_token: 8193
26
+ start_text_token: 0
27
+ stop_text_token: 1
28
+ train_solo_embeddings: false
29
+ condition_type: "conformer_perceiver"
30
+ condition_module:
31
+ output_size: 512
32
+ linear_units: 2048
33
+ attention_heads: 8
34
+ num_blocks: 6
35
+ input_layer: "conv2d2"
36
+ perceiver_mult: 2
37
+ emo_condition_module:
38
+ output_size: 512
39
+ linear_units: 1024
40
+ attention_heads: 4
41
+ num_blocks: 4
42
+ input_layer: "conv2d2"
43
+ perceiver_mult: 2
44
+
45
+ semantic_codec:
46
+ codebook_size: 8192
47
+ hidden_size: 1024
48
+ codebook_dim: 8
49
+ vocos_dim: 384
50
+ vocos_intermediate_dim: 2048
51
+ vocos_num_layers: 12
52
+
53
+ s2mel:
54
+ preprocess_params:
55
+ sr: 22050
56
+ spect_params:
57
+ n_fft: 1024
58
+ win_length: 1024
59
+ hop_length: 256
60
+ n_mels: 80
61
+ fmin: 0
62
+ fmax: "None"
63
+
64
+ dit_type: "DiT"
65
+ reg_loss_type: "l1"
66
+ style_encoder:
67
+ dim: 192
68
+ length_regulator:
69
+ channels: 512
70
+ is_discrete: false
71
+ in_channels: 1024
72
+ content_codebook_size: 2048
73
+ sampling_ratios: [1, 1, 1, 1]
74
+ vector_quantize: false
75
+ n_codebooks: 1
76
+ quantizer_dropout: 0.0
77
+ f0_condition: false
78
+ n_f0_bins: 512
79
+ DiT:
80
+ hidden_dim: 512
81
+ num_heads: 8
82
+ depth: 13
83
+ class_dropout_prob: 0.1
84
+ block_size: 8192
85
+ in_channels: 80
86
+ style_condition: true
87
+ final_layer_type: 'wavenet'
88
+ target: 'mel'
89
+ content_dim: 512
90
+ content_codebook_size: 1024
91
+ content_type: 'discrete'
92
+ f0_condition: false
93
+ n_f0_bins: 512
94
+ content_codebooks: 1
95
+ is_causal: false
96
+ long_skip_connection: true
97
+ zero_prompt_speech_token: false
98
+ time_as_token: false
99
+ style_as_token: false
100
+ uvit_skip_connection: true
101
+ add_resblock_in_transformer: false
102
+ wavenet:
103
+ hidden_dim: 512
104
+ num_layers: 8
105
+ kernel_size: 5
106
+ dilation_rate: 1
107
+ p_dropout: 0.2
108
+ style_condition: true
109
+
110
+ gpt_checkpoint: gpt.pth
111
+ w2v_stat: wav2vec2bert_stats.pt
112
+ s2mel_checkpoint: s2mel.pth
113
+ emo_matrix: feat2.pt
114
+ spk_matrix: feat1.pt
115
+ emo_num: [3, 17, 2, 8, 4, 5, 10, 24]
116
+ qwen_emo_path: qwen0.6bemo4-merge/
117
+ vocoder:
118
+ type: "bigvgan"
119
+ name: "nvidia/bigvgan_v2_22khz_80band_256x"
120
+ version: 2.0
configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"task":"text-to-speech"}
feat1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f219cb447d80216ba615666da2ff8d63ac544eee26657f3a7b278692bf7a67c4
3
+ size 57170
feat2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c4292e96dee535aea9a6206e9a0c856dd578dde9212acdb16dd3ada4d12bf80
3
+ size 374866
gpt.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baaaeb8b56328da81731dc540a85a7dee32eca9da28f174b05757cb651c602a4
3
+ size 3484663079
gpt/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2InferenceModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 8192,
8
+ "dtype": "float16",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 8193,
11
+ "gradient_checkpointing": false,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 2417,
16
+ "n_embd": 1280,
17
+ "n_head": 20,
18
+ "n_inner": null,
19
+ "n_layer": 24,
20
+ "n_positions": 1818,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "transformers_version": "4.56.1",
31
+ "use_cache": true,
32
+ "vocab_size": 8194
33
+ }
gpt/preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 128,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "Qwen2AudioProcessor",
12
+ "return_attention_mask": true,
13
+ "sampling_rate": 16000
14
+ }
gpt/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eaf87cd7c2d88cb5baef402c039f400501a83171cf651914fcc4cae6b1b8723d
3
+ size 991248387
gpt/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gpt/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_max_length": 1024}
qwen0.6bemo4-merge/Modelfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ollama modelfile auto-generated by llamafactory
2
+
3
+ FROM .
4
+
5
+ TEMPLATE """{{ if .System }}System: {{ .System }}<|endoftext|>
6
+ {{ end }}{{ range .Messages }}{{ if eq .Role "user" }}Human: {{ .Content }}<|endoftext|>
7
+ Assistant:{{ else if eq .Role "assistant" }}{{ .Content }}<|endoftext|>
8
+ {{ end }}{{ end }}"""
9
+
10
+ PARAMETER stop "<|endoftext|>"
11
+ PARAMETER num_ctx 4096
qwen0.6bemo4-merge/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
qwen0.6bemo4-merge/chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ 'System: ' + system_message + '<|endoftext|>' + '
2
+ ' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '<|endoftext|>' + '
3
+ Assistant:' }}{% elif message['role'] == 'assistant' %}{{ content + '<|endoftext|>' + '
4
+ ' }}{% endif %}{% endfor %}
qwen0.6bemo4-merge/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": true,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.52.1",
27
+ "use_cache": true,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
qwen0.6bemo4-merge/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.52.1"
6
+ }
qwen0.6bemo4-merge/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen0.6bemo4-merge/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11293257a8df593c154a8ecd5fc039f3076de35411e35f06d41b471e136f6641
3
+ size 1192135096
qwen0.6bemo4-merge/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
qwen0.6bemo4-merge/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
qwen0.6bemo4-merge/tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|endoftext|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "padding_side": "left",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
qwen0.6bemo4-merge/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
s2mel.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aae1bb12017cbb47e7a5ce537fc82f40b6b1deb71acdb9b8f25686f32714b636
3
+ size 1202198223
semantic_codec/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec947271175d8cad75ec37e83aa487e27c97a0f72a303393772da5ffa84bddf2
3
+ size 177183712
w2v-bert-2.0/README.md ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - af
5
+ - am
6
+ - ar
7
+ - as
8
+ - az
9
+ - be
10
+ - bn
11
+ - bs
12
+ - bg
13
+ - ca
14
+ - cs
15
+ - zh
16
+ - cy
17
+ - da
18
+ - de
19
+ - el
20
+ - en
21
+ - et
22
+ - fi
23
+ - fr
24
+ - or
25
+ - om
26
+ - ga
27
+ - gl
28
+ - gu
29
+ - ha
30
+ - he
31
+ - hi
32
+ - hr
33
+ - hu
34
+ - hy
35
+ - ig
36
+ - id
37
+ - is
38
+ - it
39
+ - jv
40
+ - ja
41
+ - kn
42
+ - ka
43
+ - kk
44
+ - mn
45
+ - km
46
+ - ky
47
+ - ko
48
+ - lo
49
+ - ln
50
+ - lt
51
+ - lb
52
+ - lg
53
+ - lv
54
+ - ml
55
+ - mr
56
+ - mk
57
+ - mt
58
+ - mi
59
+ - my
60
+ - nl
61
+ - nb
62
+ - ne
63
+ - ny
64
+ - oc
65
+ - pa
66
+ - ps
67
+ - fa
68
+ - pl
69
+ - pt
70
+ - ro
71
+ - ru
72
+ - sk
73
+ - sl
74
+ - sn
75
+ - sd
76
+ - so
77
+ - es
78
+ - sr
79
+ - sv
80
+ - sw
81
+ - ta
82
+ - te
83
+ - tg
84
+ - tl
85
+ - th
86
+ - tr
87
+ - uk
88
+ - ur
89
+ - uz
90
+ - vi
91
+ - wo
92
+ - xh
93
+ - yo
94
+ - ms
95
+ - zu
96
+ - ary
97
+ - arz
98
+ - yue
99
+ - kea
100
+ inference: false
101
+ ---
102
+ # W2v-BERT 2.0 speech encoder
103
+
104
+ We are open-sourcing our Conformer-based [W2v-BERT 2.0 speech encoder](#w2v-bert-20-speech-encoder) as described in Section 3.2.1 of the [paper](https://arxiv.org/pdf/2312.05187.pdf), which is at the core of our Seamless models.
105
+
106
+ This model was pre-trained on 4.5M hours of unlabeled audio data covering more than 143 languages. It requires finetuning to be used for downstream tasks such as Automatic Speech Recognition (ASR), or Audio Classification.
107
+
108
+ | Model Name | #params | checkpoint |
109
+ | ----------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
110
+ | W2v-BERT 2.0 | 600M | [checkpoint](https://huggingface.co/reach-vb/conformer-shaw/resolve/main/conformer_shaw.pt)
111
+
112
+ **This model and its training are supported by 🤗 Transformers, more on it in the [docs](https://huggingface.co/docs/transformers/main/en/model_doc/wav2vec2-bert).**
113
+
114
+
115
+ # 🤗 Transformers usage
116
+
117
+ This is a bare checkpoint without any modeling head, and thus requires finetuning to be used for downstream tasks such as ASR. You can however use it to extract audio embeddings from the top layer with this code snippet:
118
+
119
+ ```python
120
+ from transformers import AutoProcessor, Wav2Vec2BertModel
121
+ import torch
122
+ from datasets import load_dataset
123
+
124
+ dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
125
+ dataset = dataset.sort("id")
126
+ sampling_rate = dataset.features["audio"].sampling_rate
127
+
128
+ processor = AutoProcessor.from_pretrained("facebook/w2v-bert-2.0")
129
+ model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")
130
+
131
+ # audio file is decoded on the fly
132
+ inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
133
+ with torch.no_grad():
134
+ outputs = model(**inputs)
135
+ ```
136
+
137
+ To learn more about the model use, refer to the following resources:
138
+ - [its docs](https://huggingface.co/docs/transformers/main/en/model_doc/wav2vec2-bert)
139
+ - [a blog post showing how to fine-tune it on Mongolian ASR](https://huggingface.co/blog/fine-tune-w2v2-bert)
140
+ - [a training script example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py)
141
+
142
+
143
+ # Seamless Communication usage
144
+
145
+ This model can be used in [Seamless Communication](https://github.com/facebookresearch/seamless_communication), where it was released.
146
+
147
+ Here's how to make a forward pass through the voice encoder, after having completed the [installation steps](https://github.com/facebookresearch/seamless_communication?tab=readme-ov-file#installation):
148
+
149
+ ```python
150
+ import torch
151
+
152
+ from fairseq2.data import Collater
+ from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter
153
+ from fairseq2.memory import MemoryBlock
154
+ from fairseq2.nn.padding import get_seqs_and_padding_mask
155
+ from pathlib import Path
156
+ from seamless_communication.models.conformer_shaw import load_conformer_shaw_model
157
+
158
+
159
+ audio_wav_path, device, dtype = ...
160
+ audio_decoder = AudioDecoder(dtype=torch.float32, device=device)
161
+ fbank_converter = WaveformToFbankConverter(
162
+ num_mel_bins=80,
163
+ waveform_scale=2**15,
164
+ channel_last=True,
165
+ standardize=True,
166
+ device=device,
167
+ dtype=dtype,
168
+ )
169
+ collater = Collater(pad_value=1)
170
+
171
+ model = load_conformer_shaw_model("conformer_shaw", device=device, dtype=dtype)
172
+ model.eval()
173
+
174
+ with Path(audio_wav_path).open("rb") as fb:
175
+ block = MemoryBlock(fb.read())
176
+
177
+ decoded_audio = audio_decoder(block)
178
+ src = collater(fbank_converter(decoded_audio))["fbank"]
179
+ seqs, padding_mask = get_seqs_and_padding_mask(src)
180
+
181
+ with torch.inference_mode():
182
+ seqs, padding_mask = model.encoder_frontend(seqs, padding_mask)
183
+ seqs, padding_mask = model.encoder(seqs, padding_mask)
184
+ ```
w2v-bert-2.0/config.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "adapter_act": "relu",
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": false,
8
+ "architectures": [
9
+ "Wav2Vec2BertModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 768,
14
+ "codevector_dim": 768,
15
+ "conformer_conv_dropout": 0.1,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_depthwise_kernel_size": 31,
18
+ "ctc_loss_reduction": "sum",
19
+ "ctc_zero_infinity": false,
20
+ "diversity_loss_weight": 0.1,
21
+ "eos_token_id": 2,
22
+ "feat_proj_dropout": 0.0,
23
+ "feat_quantizer_dropout": 0.0,
24
+ "feature_projection_input_dim": 160,
25
+ "final_dropout": 0.1,
26
+ "hidden_act": "swish",
27
+ "hidden_dropout": 0.0,
28
+ "hidden_size": 1024,
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 4096,
31
+ "layer_norm_eps": 1e-05,
32
+ "layerdrop": 0.1,
33
+ "left_max_position_embeddings": 64,
34
+ "mask_feature_length": 10,
35
+ "mask_feature_min_masks": 0,
36
+ "mask_feature_prob": 0.0,
37
+ "mask_time_length": 10,
38
+ "mask_time_min_masks": 2,
39
+ "mask_time_prob": 0.05,
40
+ "max_source_positions": 5000,
41
+ "model_type": "wav2vec2-bert",
42
+ "num_adapter_layers": 1,
43
+ "num_attention_heads": 16,
44
+ "num_codevector_groups": 2,
45
+ "num_codevectors_per_group": 320,
46
+ "num_hidden_layers": 24,
47
+ "num_negatives": 100,
48
+ "output_hidden_size": 1024,
49
+ "pad_token_id": 0,
50
+ "position_embeddings_type": "relative_key",
51
+ "proj_codevector_dim": 768,
52
+ "right_max_position_embeddings": 8,
53
+ "rotary_embedding_base": 10000,
54
+ "tdnn_dilation": [
55
+ 1,
56
+ 2,
57
+ 3,
58
+ 1,
59
+ 1
60
+ ],
61
+ "tdnn_dim": [
62
+ 512,
63
+ 512,
64
+ 512,
65
+ 512,
66
+ 1500
67
+ ],
68
+ "tdnn_kernel": [
69
+ 5,
70
+ 3,
71
+ 3,
72
+ 1,
73
+ 1
74
+ ],
75
+ "torch_dtype": "float32",
76
+ "transformers_version": "4.37.0.dev0",
77
+ "use_intermediate_ffn_before_adapter": false,
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": null,
80
+ "xvector_output_dim": 512
81
+ }
w2v-bert-2.0/conformer_shaw.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8310b4270a5b499e92e20c859892dbf7429619347debb5f8feba79eb88f99b4f
3
+ size 2329131983
w2v-bert-2.0/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb890c9660ed6e3414b6812e27257b8ce5454365d5490d3ad581ea60b93be043
3
+ size 2322063736
w2v-bert-2.0/preprocessor_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor_type": "SeamlessM4TFeatureExtractor",
3
+ "feature_size": 80,
4
+ "num_mel_bins": 80,
5
+ "padding_side": "right",
6
+ "padding_value": 1,
7
+ "processor_class": "Wav2Vec2BertProcessor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000,
10
+ "stride": 2
11
+ }
wav2vec2bert_stats.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9c176c2b8850ab2e3ba828bbfa969deaf4566ce55db5f2687b8430b87526ad2
3
+ size 9343