niobures committed on
Commit
70d8017
·
verified ·
1 Parent(s): 84845a9

TADA (multi)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. multi/tada-1b/.gitattributes +35 -0
  2. multi/tada-1b/README.md +154 -0
  3. multi/tada-1b/config.json +49 -0
  4. multi/tada-1b/generation_config.json +6 -0
  5. multi/tada-1b/graphics/CER.png +0 -0
  6. multi/tada-1b/graphics/naturalness.png +0 -0
  7. multi/tada-1b/graphics/real-time.png +0 -0
  8. multi/tada-1b/graphics/speaker-sim.png +0 -0
  9. multi/tada-1b/issues.txt +14 -0
  10. multi/tada-1b/model.safetensors +3 -0
  11. multi/tada-1b/source.txt +1 -0
  12. multi/tada-3b-ml/.gitattributes +35 -0
  13. multi/tada-3b-ml/config.json +53 -0
  14. multi/tada-3b-ml/final-graphics-polished/.DS_Store +0 -0
  15. multi/tada-3b-ml/final-graphics-polished/CER.png +0 -0
  16. multi/tada-3b-ml/final-graphics-polished/MOS.png +0 -0
  17. multi/tada-3b-ml/final-graphics-polished/naturalness.png +0 -0
  18. multi/tada-3b-ml/final-graphics-polished/real-time.png +0 -0
  19. multi/tada-3b-ml/generation_config.json +10 -0
  20. multi/tada-3b-ml/languages.txt +10 -0
  21. multi/tada-3b-ml/model-00001-of-00002.safetensors +3 -0
  22. multi/tada-3b-ml/model-00002-of-00002.safetensors +3 -0
  23. multi/tada-3b-ml/model.safetensors.index.json +510 -0
  24. multi/tada-3b-ml/source.txt +1 -0
  25. multi/tada-codec/.gitattributes +35 -0
  26. multi/tada-codec/README.md +40 -0
  27. multi/tada-codec/aligner-ar/config.json +7 -0
  28. multi/tada-codec/aligner-ar/model.safetensors +3 -0
  29. multi/tada-codec/aligner-ch/config.json +7 -0
  30. multi/tada-codec/aligner-ch/model.safetensors +3 -0
  31. multi/tada-codec/aligner-de/config.json +7 -0
  32. multi/tada-codec/aligner-de/model.safetensors +3 -0
  33. multi/tada-codec/aligner-es/config.json +7 -0
  34. multi/tada-codec/aligner-es/model.safetensors +3 -0
  35. multi/tada-codec/aligner-fr/config.json +7 -0
  36. multi/tada-codec/aligner-fr/model.safetensors +3 -0
  37. multi/tada-codec/aligner-it/config.json +7 -0
  38. multi/tada-codec/aligner-it/model.safetensors +3 -0
  39. multi/tada-codec/aligner-ja/config.json +7 -0
  40. multi/tada-codec/aligner-ja/model.safetensors +3 -0
  41. multi/tada-codec/aligner-pl/config.json +7 -0
  42. multi/tada-codec/aligner-pl/model.safetensors +3 -0
  43. multi/tada-codec/aligner-pt/config.json +7 -0
  44. multi/tada-codec/aligner-pt/model.safetensors +3 -0
  45. multi/tada-codec/aligner/config.json +7 -0
  46. multi/tada-codec/aligner/model.safetensors +3 -0
  47. multi/tada-codec/decoder/config.json +7 -0
  48. multi/tada-codec/decoder/model.safetensors +3 -0
  49. multi/tada-codec/encoder/config.json +7 -0
  50. multi/tada-codec/encoder/model.safetensors +3 -0
multi/tada-1b/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
multi/tada-1b/README.md ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ tags:
6
+ - tts
7
+ - text-to-speech
8
+ - speech-language-model
9
+ arxiv: 2602.23068
10
+ ---
11
+
12
+ <h1 align="center">TADA: A Generative Framework for Speech Modeling via Text-Acoustic Dual Alignment</h1>
13
+
14
+ <p align="center">
15
+ <a href="https://arxiv.org/abs/2602.23068"><img src="https://img.shields.io/badge/arXiv-Paper-b31b1b.svg" alt="Paper"></a>
16
+ <a href="https://huggingface.co/spaces/fffiloni/tada-dual-alignment-tts-demo"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-pink" alt="Demo"></a>
17
+ <a href="https://huggingface.co/spaces/HumeAI/tada"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-blue" alt="Demo"></a>
18
+ <a href="https://huggingface.co/collections/HumeAI/tada"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Collection-yellow" alt="Collection"></a>
19
+ <a href="https://pypi.org/project/hume-tada/"><img src="https://img.shields.io/badge/PyPI-hume--tada-3775A9.svg?logo=pypi&logoColor=white" alt="PyPI"></a>
20
+ <a href="https://www.hume.ai/blog/opensource-tada"><img src="https://img.shields.io/badge/Blog-Post-orange.svg" alt="Blog"></a>
21
+ <a href="https://github.com/HumeAI/tada/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License"></a>
22
+ </p>
23
+
24
+ <img width="2400" height="1260" alt="image" src="https://github.com/user-attachments/assets/800eb8c5-eb6f-4e03-b8f3-150055a6cdfc" />
25
+
26
+ <p align="center"><br/><em>A unified speech-language model that synchronizes speech and text into a single, cohesive stream via 1:1 alignment.</em></p>
27
+
28
+ ---
29
+
30
+ # Text-Acoustic Dual-Alignment Large Language Model
31
+
32
+ TADA is a unified speech-language model that synchronizes speech and text into a single, cohesive stream via 1:1 alignment. By leveraging a novel tokenizer and architectural design, TADA achieves high-fidelity synthesis and generation with a fraction of the computational overhead required by traditional models.
33
+
34
+ ⭐️ arxiv: https://arxiv.org/abs/2602.23068 \
35
+ ⭐️ demo1: https://huggingface.co/spaces/fffiloni/tada-dual-alignment-tts-demo \
36
+ ⭐️ demo2: https://huggingface.co/spaces/HumeAI/tada \
37
+ ⭐️ github: https://github.com/HumeAI/tada \
38
+ ⭐️ blog post: https://www.hume.ai/blog/opensource-tada
39
+
40
+ ## Key Features
41
+
42
+ - 1:1 Token Alignment: Unlike standard models, TADA’s tokenizer encodes audio into a sequence of vectors that perfectly matches the number of text tokens.
43
+ - Dynamic Duration Synthesis: As a TTS model, it generates the full speech segment for a text token in a single autoregressive step, regardless of length. This eliminates the need for fixed-frame-rate processing.
44
+ - Dual-Stream Generation: In speech-language modeling mode, it generates a text token and the speech for the preceding token simultaneously, maintaining the same context length and minimal overhead compared to text-only generation.
45
+ - Efficiency & Reliability: TADA delivers superior expressiveness and natural flow while significantly reducing the computational cost associated with fixed audio frame rates.
46
+
47
+ ## How It Works
48
+
49
+ ### The Tokenization Schema
50
+
51
+ TADA unifies modalities by ensuring that for every word or subword token, there is exactly one corresponding speech vector. This synchronized stream allows the model to "understand" the precise timing of speech relative to text.
52
+
53
+ ### Dynamic Autoregression
54
+
55
+ Most TTS models require a fixed number of steps to produce one second of audio (e.g., 50 frames per second). TADA breaks this constraint:
56
+
57
+ - Each autoregressive step covers one text token.
58
+ - The model dynamically determines the duration and prosody for that specific token.
59
+ - This results in a more natural flow and eliminates transcript hallucination.
60
+
61
+ ## Installation
62
+
63
+ From the github repo
64
+
65
+ ```bash
66
+ pip install git+https://github.com/HumeAI/tada.git
67
+ ```
68
+
69
+ From source
70
+
71
+ ```bash
72
+ pip install -e .
73
+ ```
74
+
75
+ ## Models
76
+
77
+ We provide several model checkpoints:
78
+
79
+ | Model | Base Model | HuggingFace Hub |
80
+ | ------- | ------------ | --------------------------------------------------------- |
81
+ | TADA-1B | Llama 3.2 1B | [`HumeAI/tada-1b`](https://huggingface.co/HumeAI/tada-1b) |
82
+ | TADA-3B-ml | Llama 3.2 3B | [`HumeAI/tada-3b-ml`](https://huggingface.co/HumeAI/tada-3b-ml) |
83
+
84
+ All models use the same encoder ([`HumeAI/tada-codec`](https://huggingface.co/HumeAI/tada-codec)) and can be loaded using the same API.
85
+
86
+ ## Evaluation
87
+
88
+ <table>
89
+ <tr>
90
+ <td><img src="https://huggingface.co/HumeAI/tada-1b/resolve/main/graphics/CER.png" alt="CER" height="200px"></td>
91
+ <td><img src="https://huggingface.co/HumeAI/tada-1b/resolve/main/graphics/real-time.png" alt="Speed" height="200px"></td>
92
+ </tr>
93
+
94
+ <tr>
95
+ <td><img src="https://huggingface.co/HumeAI/tada-1b/resolve/main/graphics/speaker-sim.png" alt="Speaker Similarity" height="200px"></td>
96
+ <td><img src="https://huggingface.co/HumeAI/tada-1b/resolve/main/graphics/naturalness.png" alt="MOS" height="200px"></td>
97
+ </tr>
98
+ </table>
99
+
100
+ ## Run Inference
101
+
102
+ ### Text-to-speech
103
+
104
+ ```python
105
+ import torch
106
+ import torchaudio
107
+
108
+ from tada.modules.encoder import Encoder
109
+ from tada.modules.tada import TadaForCausalLM
110
+
111
+ device = "cuda"
112
+ encoder = Encoder.from_pretrained("HumeAI/tada-codec", subfolder="encoder").to(device)
113
+ model = TadaForCausalLM.from_pretrained("HumeAI/tada-1b").to(device)
114
+
115
+ audio, sample_rate = torchaudio.load("samples/ljspeech.wav")
116
+ audio = audio.to(device)
117
+ prompt_text = "The examination and testimony of the experts, enabled the commission to conclude that five shots may have been fired."
118
+ prompt = encoder(
119
+ audio, text=[prompt_text], sample_rate=sample_rate
120
+ )
121
+
122
+ output = model.generate(
123
+ prompt=prompt,
124
+ text="Please call Stella. Ask her to bring these things with her from the store.",
125
+ )
126
+ ```
127
+
128
+ ### Speech continuation
129
+
130
+ Provide `num_extra_steps` if you want to generate text+speech continuation of the prompt
131
+
132
+ ```python
133
+ output = model.generate(
134
+ prompt=prompt,
135
+ num_extra_steps=50
136
+ )
137
+ ```
138
+
139
+ ## 📚 Citation
140
+
141
+ If you use this project in your research, please cite our paper:
142
+
143
+ ```bibtex
144
+ @article{dang2026tada,
145
+ title={TADA: A Generative Framework for Speech Modeling via Text-Acoustic Dual Alignment},
146
+ author={Dang, Trung and Rao, Sharath and Gupta, Ananya and Gagne, Christopher and Tzirakis, Panagiotis and Baird, Alice and Cłapa, Jakub Piotr and Chin, Peter and Cowen, Alan},
147
+ journal={arXiv preprint arXiv:2602.23068},
148
+ year={2026}
149
+ }
150
+ ```
151
+
152
+ ## Contact
153
+
154
+ Hume AI is an empathic AI research company. We research the datasets, tools, and models needed to give empathy to AI models to serve human wellbeing. If you're interested in any of our products or research collaborations, please reach out to us at hello@hume.ai
multi/tada-1b/config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "acoustic_dim": 512,
3
+ "acoustic_from_nth_hidden_state": -1,
4
+ "acoustic_mean": 0.0,
5
+ "acoustic_std": 1.5,
6
+ "add_semantic_to_condition": 0.0,
7
+ "architectures": [
8
+ "TadaForCausalLM"
9
+ ],
10
+ "attention_bias": false,
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 128000,
13
+ "bottleneck_dim": null,
14
+ "context_window": 8,
15
+ "diffusion_head_type": "vibevoice",
16
+ "dist_type": "fixed",
17
+ "dtype": "bfloat16",
18
+ "eos_token_id": 128001,
19
+ "head_dim": 64,
20
+ "head_ffn_ratio": 4.0,
21
+ "head_layers": 6,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 2048,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 8192,
26
+ "latent_dropout": 0.0,
27
+ "max_position_embeddings": 131072,
28
+ "mlp_bias": false,
29
+ "model_type": "llama",
30
+ "num_attention_heads": 32,
31
+ "num_hidden_layers": 16,
32
+ "num_key_value_heads": 8,
33
+ "num_time_classes": 256,
34
+ "pretraining_tp": 1,
35
+ "rms_norm_eps": 1e-05,
36
+ "rope_scaling": {
37
+ "factor": 32.0,
38
+ "high_freq_factor": 4.0,
39
+ "low_freq_factor": 1.0,
40
+ "original_max_position_embeddings": 8192,
41
+ "rope_type": "llama3"
42
+ },
43
+ "rope_theta": 500000.0,
44
+ "shift_acoustic": 5,
45
+ "tie_word_embeddings": true,
46
+ "transformers_version": "4.57.3",
47
+ "use_cache": true,
48
+ "vocab_size": 128256
49
+ }
multi/tada-1b/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "eos_token_id": 128001,
5
+ "transformers_version": "4.57.3"
6
+ }
multi/tada-1b/graphics/CER.png ADDED
multi/tada-1b/graphics/naturalness.png ADDED
multi/tada-1b/graphics/real-time.png ADDED
multi/tada-1b/graphics/speaker-sim.png ADDED
multi/tada-1b/issues.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -------------------------------------------------------------
2
+ #2 Installation Video and Testing - Step by Step
3
+ -------------------------------------------------------------
4
+
5
+ [fahdmirzac] 12 Mar 2026
6
+
7
+ Hi,
8
+ Kudos on producing such a sublime model. I did a local installation and testing video :
9
+
10
+ TADA: This Free Speech Model Just Broke the Rules of TTS - Local Demo
11
+ https://www.youtube.com/watch?v=DgX43zSjnB0
12
+
13
+ Thanks and regards,
14
+ Fahd
multi/tada-1b/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32063b83702a6a9f7527c1541b4e9adb24433f0b3563f4bd8345aebbb8282c39
3
+ size 3922463244
multi/tada-1b/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/HumeAI/tada-1b
multi/tada-3b-ml/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
multi/tada-3b-ml/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "acoustic_dim": 512,
3
+ "acoustic_from_nth_hidden_state": -1,
4
+ "acoustic_mean": 0.0,
5
+ "acoustic_std": 1.5,
6
+ "add_semantic_to_condition": 0.0,
7
+ "architectures": [
8
+ "TadaForCausalLM"
9
+ ],
10
+ "attention_bias": false,
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 128000,
13
+ "bottleneck_dim": null,
14
+ "context_window": 8,
15
+ "diffusion_head_type": "vibevoice",
16
+ "dist_type": "fixed",
17
+ "dtype": "bfloat16",
18
+ "eos_token_id": [
19
+ 128001,
20
+ 128008,
21
+ 128009
22
+ ],
23
+ "head_dim": 128,
24
+ "head_ffn_ratio": 4.0,
25
+ "head_layers": 6,
26
+ "hidden_act": "silu",
27
+ "hidden_size": 3072,
28
+ "initializer_range": 0.02,
29
+ "intermediate_size": 8192,
30
+ "latent_dropout": 0.0,
31
+ "max_position_embeddings": 131072,
32
+ "mlp_bias": false,
33
+ "model_type": "llama",
34
+ "num_attention_heads": 24,
35
+ "num_hidden_layers": 28,
36
+ "num_key_value_heads": 8,
37
+ "num_time_classes": 256,
38
+ "pretraining_tp": 1,
39
+ "rms_norm_eps": 1e-05,
40
+ "rope_scaling": {
41
+ "factor": 32.0,
42
+ "high_freq_factor": 4.0,
43
+ "low_freq_factor": 1.0,
44
+ "original_max_position_embeddings": 8192,
45
+ "rope_type": "llama3"
46
+ },
47
+ "rope_theta": 500000.0,
48
+ "shift_acoustic": 5,
49
+ "tie_word_embeddings": true,
50
+ "transformers_version": "4.57.3",
51
+ "use_cache": true,
52
+ "vocab_size": 128256
53
+ }
multi/tada-3b-ml/final-graphics-polished/.DS_Store ADDED
Binary file (6.15 kB). View file
 
multi/tada-3b-ml/final-graphics-polished/CER.png ADDED
multi/tada-3b-ml/final-graphics-polished/MOS.png ADDED
multi/tada-3b-ml/final-graphics-polished/naturalness.png ADDED
multi/tada-3b-ml/final-graphics-polished/real-time.png ADDED
multi/tada-3b-ml/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "eos_token_id": [
5
+ 128001,
6
+ 128008,
7
+ 128009
8
+ ],
9
+ "transformers_version": "4.57.3"
10
+ }
multi/tada-3b-ml/languages.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ English
2
+ Japanese
3
+ German
4
+ French
5
+ Spanish
6
+ Chamorro
7
+ Arabic
8
+ Italian
9
+ Polish
10
+ Portuguese
multi/tada-3b-ml/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:639a29ca27fdfe5c0a5bb18d24b47a72a049310ccd6c2ff4ca768640e1766470
3
+ size 4965799096
multi/tada-3b-ml/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52b2b31e1541d1703a562b48fc5872505518d264e9bcc829352cefac9ec284b4
3
+ size 3901011596
multi/tada-3b-ml/model.safetensors.index.json ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4225756258,
4
+ "total_size": 8866748612
5
+ },
6
+ "weight_map": {
7
+ "_decoder.decoder_proj.bias": "model-00002-of-00002.safetensors",
8
+ "_decoder.decoder_proj.weight": "model-00002-of-00002.safetensors",
9
+ "_decoder.local_attention_decoder.final_norm.bias": "model-00002-of-00002.safetensors",
10
+ "_decoder.local_attention_decoder.final_norm.weight": "model-00002-of-00002.safetensors",
11
+ "_decoder.local_attention_decoder.layers.0.ffn.0.bias": "model-00002-of-00002.safetensors",
12
+ "_decoder.local_attention_decoder.layers.0.ffn.0.weight": "model-00002-of-00002.safetensors",
13
+ "_decoder.local_attention_decoder.layers.0.ffn.3.bias": "model-00002-of-00002.safetensors",
14
+ "_decoder.local_attention_decoder.layers.0.ffn.3.weight": "model-00002-of-00002.safetensors",
15
+ "_decoder.local_attention_decoder.layers.0.norm.bias": "model-00002-of-00002.safetensors",
16
+ "_decoder.local_attention_decoder.layers.0.norm.weight": "model-00002-of-00002.safetensors",
17
+ "_decoder.local_attention_decoder.layers.0.self_attn._precomputed_mask": "model-00002-of-00002.safetensors",
18
+ "_decoder.local_attention_decoder.layers.0.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors",
19
+ "_decoder.local_attention_decoder.layers.0.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors",
20
+ "_decoder.local_attention_decoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
21
+ "_decoder.local_attention_decoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
22
+ "_decoder.local_attention_decoder.layers.0.self_attn.qkv.bias": "model-00002-of-00002.safetensors",
23
+ "_decoder.local_attention_decoder.layers.0.self_attn.qkv.weight": "model-00002-of-00002.safetensors",
24
+ "_decoder.local_attention_decoder.layers.0.self_attn.rope_freqs": "model-00002-of-00002.safetensors",
25
+ "_decoder.local_attention_decoder.layers.1.ffn.0.bias": "model-00002-of-00002.safetensors",
26
+ "_decoder.local_attention_decoder.layers.1.ffn.0.weight": "model-00002-of-00002.safetensors",
27
+ "_decoder.local_attention_decoder.layers.1.ffn.3.bias": "model-00002-of-00002.safetensors",
28
+ "_decoder.local_attention_decoder.layers.1.ffn.3.weight": "model-00002-of-00002.safetensors",
29
+ "_decoder.local_attention_decoder.layers.1.norm.bias": "model-00002-of-00002.safetensors",
30
+ "_decoder.local_attention_decoder.layers.1.norm.weight": "model-00002-of-00002.safetensors",
31
+ "_decoder.local_attention_decoder.layers.1.self_attn._precomputed_mask": "model-00002-of-00002.safetensors",
32
+ "_decoder.local_attention_decoder.layers.1.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors",
33
+ "_decoder.local_attention_decoder.layers.1.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors",
34
+ "_decoder.local_attention_decoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
35
+ "_decoder.local_attention_decoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
36
+ "_decoder.local_attention_decoder.layers.1.self_attn.qkv.bias": "model-00002-of-00002.safetensors",
37
+ "_decoder.local_attention_decoder.layers.1.self_attn.qkv.weight": "model-00002-of-00002.safetensors",
38
+ "_decoder.local_attention_decoder.layers.1.self_attn.rope_freqs": "model-00002-of-00002.safetensors",
39
+ "_decoder.local_attention_decoder.layers.2.ffn.0.bias": "model-00002-of-00002.safetensors",
40
+ "_decoder.local_attention_decoder.layers.2.ffn.0.weight": "model-00002-of-00002.safetensors",
41
+ "_decoder.local_attention_decoder.layers.2.ffn.3.bias": "model-00002-of-00002.safetensors",
42
+ "_decoder.local_attention_decoder.layers.2.ffn.3.weight": "model-00002-of-00002.safetensors",
43
+ "_decoder.local_attention_decoder.layers.2.norm.bias": "model-00002-of-00002.safetensors",
44
+ "_decoder.local_attention_decoder.layers.2.norm.weight": "model-00002-of-00002.safetensors",
45
+ "_decoder.local_attention_decoder.layers.2.self_attn._precomputed_mask": "model-00002-of-00002.safetensors",
46
+ "_decoder.local_attention_decoder.layers.2.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors",
47
+ "_decoder.local_attention_decoder.layers.2.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors",
48
+ "_decoder.local_attention_decoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
49
+ "_decoder.local_attention_decoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
50
+ "_decoder.local_attention_decoder.layers.2.self_attn.qkv.bias": "model-00002-of-00002.safetensors",
51
+ "_decoder.local_attention_decoder.layers.2.self_attn.qkv.weight": "model-00002-of-00002.safetensors",
52
+ "_decoder.local_attention_decoder.layers.2.self_attn.rope_freqs": "model-00002-of-00002.safetensors",
53
+ "_decoder.local_attention_decoder.layers.3.ffn.0.bias": "model-00002-of-00002.safetensors",
54
+ "_decoder.local_attention_decoder.layers.3.ffn.0.weight": "model-00002-of-00002.safetensors",
55
+ "_decoder.local_attention_decoder.layers.3.ffn.3.bias": "model-00002-of-00002.safetensors",
56
+ "_decoder.local_attention_decoder.layers.3.ffn.3.weight": "model-00002-of-00002.safetensors",
57
+ "_decoder.local_attention_decoder.layers.3.norm.bias": "model-00002-of-00002.safetensors",
58
+ "_decoder.local_attention_decoder.layers.3.norm.weight": "model-00002-of-00002.safetensors",
59
+ "_decoder.local_attention_decoder.layers.3.self_attn._precomputed_mask": "model-00002-of-00002.safetensors",
60
+ "_decoder.local_attention_decoder.layers.3.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors",
61
+ "_decoder.local_attention_decoder.layers.3.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors",
62
+ "_decoder.local_attention_decoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
63
+ "_decoder.local_attention_decoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
64
+ "_decoder.local_attention_decoder.layers.3.self_attn.qkv.bias": "model-00002-of-00002.safetensors",
65
+ "_decoder.local_attention_decoder.layers.3.self_attn.qkv.weight": "model-00002-of-00002.safetensors",
66
+ "_decoder.local_attention_decoder.layers.3.self_attn.rope_freqs": "model-00002-of-00002.safetensors",
67
+ "_decoder.local_attention_decoder.layers.4.ffn.0.bias": "model-00002-of-00002.safetensors",
68
+ "_decoder.local_attention_decoder.layers.4.ffn.0.weight": "model-00002-of-00002.safetensors",
69
+ "_decoder.local_attention_decoder.layers.4.ffn.3.bias": "model-00002-of-00002.safetensors",
70
+ "_decoder.local_attention_decoder.layers.4.ffn.3.weight": "model-00002-of-00002.safetensors",
71
+ "_decoder.local_attention_decoder.layers.4.norm.bias": "model-00002-of-00002.safetensors",
72
+ "_decoder.local_attention_decoder.layers.4.norm.weight": "model-00002-of-00002.safetensors",
73
+ "_decoder.local_attention_decoder.layers.4.self_attn._precomputed_mask": "model-00002-of-00002.safetensors",
74
+ "_decoder.local_attention_decoder.layers.4.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors",
75
+ "_decoder.local_attention_decoder.layers.4.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors",
76
+ "_decoder.local_attention_decoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
77
+ "_decoder.local_attention_decoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
78
+ "_decoder.local_attention_decoder.layers.4.self_attn.qkv.bias": "model-00002-of-00002.safetensors",
79
+ "_decoder.local_attention_decoder.layers.4.self_attn.qkv.weight": "model-00002-of-00002.safetensors",
80
+ "_decoder.local_attention_decoder.layers.4.self_attn.rope_freqs": "model-00002-of-00002.safetensors",
81
+ "_decoder.local_attention_decoder.layers.5.ffn.0.bias": "model-00002-of-00002.safetensors",
82
+ "_decoder.local_attention_decoder.layers.5.ffn.0.weight": "model-00002-of-00002.safetensors",
83
+ "_decoder.local_attention_decoder.layers.5.ffn.3.bias": "model-00002-of-00002.safetensors",
84
+ "_decoder.local_attention_decoder.layers.5.ffn.3.weight": "model-00002-of-00002.safetensors",
85
+ "_decoder.local_attention_decoder.layers.5.norm.bias": "model-00002-of-00002.safetensors",
86
+ "_decoder.local_attention_decoder.layers.5.norm.weight": "model-00002-of-00002.safetensors",
87
+ "_decoder.local_attention_decoder.layers.5.self_attn._precomputed_mask": "model-00002-of-00002.safetensors",
88
+ "_decoder.local_attention_decoder.layers.5.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors",
89
+ "_decoder.local_attention_decoder.layers.5.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors",
90
+ "_decoder.local_attention_decoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
91
+ "_decoder.local_attention_decoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
92
+ "_decoder.local_attention_decoder.layers.5.self_attn.qkv.bias": "model-00002-of-00002.safetensors",
93
+ "_decoder.local_attention_decoder.layers.5.self_attn.qkv.weight": "model-00002-of-00002.safetensors",
94
+ "_decoder.local_attention_decoder.layers.5.self_attn.rope_freqs": "model-00002-of-00002.safetensors",
95
+ "_decoder.wav_decoder.model.0.bias": "model-00002-of-00002.safetensors",
96
+ "_decoder.wav_decoder.model.0.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
97
+ "_decoder.wav_decoder.model.0.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
98
+ "_decoder.wav_decoder.model.1.block.0.alpha": "model-00002-of-00002.safetensors",
99
+ "_decoder.wav_decoder.model.1.block.1.bias": "model-00002-of-00002.safetensors",
100
+ "_decoder.wav_decoder.model.1.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
101
+ "_decoder.wav_decoder.model.1.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
102
+ "_decoder.wav_decoder.model.1.block.2.block.0.alpha": "model-00002-of-00002.safetensors",
103
+ "_decoder.wav_decoder.model.1.block.2.block.1.bias": "model-00002-of-00002.safetensors",
104
+ "_decoder.wav_decoder.model.1.block.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
105
+ "_decoder.wav_decoder.model.1.block.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
106
+ "_decoder.wav_decoder.model.1.block.2.block.2.alpha": "model-00002-of-00002.safetensors",
107
+ "_decoder.wav_decoder.model.1.block.2.block.3.bias": "model-00002-of-00002.safetensors",
108
+ "_decoder.wav_decoder.model.1.block.2.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
109
+ "_decoder.wav_decoder.model.1.block.2.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
110
+ "_decoder.wav_decoder.model.1.block.3.block.0.alpha": "model-00002-of-00002.safetensors",
111
+ "_decoder.wav_decoder.model.1.block.3.block.1.bias": "model-00002-of-00002.safetensors",
112
+ "_decoder.wav_decoder.model.1.block.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
113
+ "_decoder.wav_decoder.model.1.block.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
114
+ "_decoder.wav_decoder.model.1.block.3.block.2.alpha": "model-00002-of-00002.safetensors",
115
+ "_decoder.wav_decoder.model.1.block.3.block.3.bias": "model-00002-of-00002.safetensors",
116
+ "_decoder.wav_decoder.model.1.block.3.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
117
+ "_decoder.wav_decoder.model.1.block.3.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
118
+ "_decoder.wav_decoder.model.1.block.4.block.0.alpha": "model-00002-of-00002.safetensors",
119
+ "_decoder.wav_decoder.model.1.block.4.block.1.bias": "model-00002-of-00002.safetensors",
120
+ "_decoder.wav_decoder.model.1.block.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
121
+ "_decoder.wav_decoder.model.1.block.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
122
+ "_decoder.wav_decoder.model.1.block.4.block.2.alpha": "model-00002-of-00002.safetensors",
123
+ "_decoder.wav_decoder.model.1.block.4.block.3.bias": "model-00002-of-00002.safetensors",
124
+ "_decoder.wav_decoder.model.1.block.4.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
125
+ "_decoder.wav_decoder.model.1.block.4.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
126
+ "_decoder.wav_decoder.model.2.block.0.alpha": "model-00002-of-00002.safetensors",
127
+ "_decoder.wav_decoder.model.2.block.1.bias": "model-00002-of-00002.safetensors",
128
+ "_decoder.wav_decoder.model.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
129
+ "_decoder.wav_decoder.model.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
130
+ "_decoder.wav_decoder.model.2.block.2.block.0.alpha": "model-00002-of-00002.safetensors",
131
+ "_decoder.wav_decoder.model.2.block.2.block.1.bias": "model-00002-of-00002.safetensors",
132
+ "_decoder.wav_decoder.model.2.block.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
133
+ "_decoder.wav_decoder.model.2.block.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
134
+ "_decoder.wav_decoder.model.2.block.2.block.2.alpha": "model-00002-of-00002.safetensors",
135
+ "_decoder.wav_decoder.model.2.block.2.block.3.bias": "model-00002-of-00002.safetensors",
136
+ "_decoder.wav_decoder.model.2.block.2.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
137
+ "_decoder.wav_decoder.model.2.block.2.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
138
+ "_decoder.wav_decoder.model.2.block.3.block.0.alpha": "model-00002-of-00002.safetensors",
139
+ "_decoder.wav_decoder.model.2.block.3.block.1.bias": "model-00002-of-00002.safetensors",
140
+ "_decoder.wav_decoder.model.2.block.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
141
+ "_decoder.wav_decoder.model.2.block.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
142
+ "_decoder.wav_decoder.model.2.block.3.block.2.alpha": "model-00002-of-00002.safetensors",
143
+ "_decoder.wav_decoder.model.2.block.3.block.3.bias": "model-00002-of-00002.safetensors",
144
+ "_decoder.wav_decoder.model.2.block.3.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
145
+ "_decoder.wav_decoder.model.2.block.3.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
146
+ "_decoder.wav_decoder.model.2.block.4.block.0.alpha": "model-00002-of-00002.safetensors",
147
+ "_decoder.wav_decoder.model.2.block.4.block.1.bias": "model-00002-of-00002.safetensors",
148
+ "_decoder.wav_decoder.model.2.block.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
149
+ "_decoder.wav_decoder.model.2.block.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
150
+ "_decoder.wav_decoder.model.2.block.4.block.2.alpha": "model-00002-of-00002.safetensors",
151
+ "_decoder.wav_decoder.model.2.block.4.block.3.bias": "model-00002-of-00002.safetensors",
152
+ "_decoder.wav_decoder.model.2.block.4.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
153
+ "_decoder.wav_decoder.model.2.block.4.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
154
+ "_decoder.wav_decoder.model.3.block.0.alpha": "model-00002-of-00002.safetensors",
155
+ "_decoder.wav_decoder.model.3.block.1.bias": "model-00002-of-00002.safetensors",
156
+ "_decoder.wav_decoder.model.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
157
+ "_decoder.wav_decoder.model.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
158
+ "_decoder.wav_decoder.model.3.block.2.block.0.alpha": "model-00002-of-00002.safetensors",
159
+ "_decoder.wav_decoder.model.3.block.2.block.1.bias": "model-00002-of-00002.safetensors",
160
+ "_decoder.wav_decoder.model.3.block.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
161
+ "_decoder.wav_decoder.model.3.block.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
162
+ "_decoder.wav_decoder.model.3.block.2.block.2.alpha": "model-00002-of-00002.safetensors",
163
+ "_decoder.wav_decoder.model.3.block.2.block.3.bias": "model-00002-of-00002.safetensors",
164
+ "_decoder.wav_decoder.model.3.block.2.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
165
+ "_decoder.wav_decoder.model.3.block.2.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
166
+ "_decoder.wav_decoder.model.3.block.3.block.0.alpha": "model-00002-of-00002.safetensors",
167
+ "_decoder.wav_decoder.model.3.block.3.block.1.bias": "model-00002-of-00002.safetensors",
168
+ "_decoder.wav_decoder.model.3.block.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
169
+ "_decoder.wav_decoder.model.3.block.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
170
+ "_decoder.wav_decoder.model.3.block.3.block.2.alpha": "model-00002-of-00002.safetensors",
171
+ "_decoder.wav_decoder.model.3.block.3.block.3.bias": "model-00002-of-00002.safetensors",
172
+ "_decoder.wav_decoder.model.3.block.3.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
173
+ "_decoder.wav_decoder.model.3.block.3.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
174
+ "_decoder.wav_decoder.model.3.block.4.block.0.alpha": "model-00002-of-00002.safetensors",
175
+ "_decoder.wav_decoder.model.3.block.4.block.1.bias": "model-00002-of-00002.safetensors",
176
+ "_decoder.wav_decoder.model.3.block.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
177
+ "_decoder.wav_decoder.model.3.block.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
178
+ "_decoder.wav_decoder.model.3.block.4.block.2.alpha": "model-00002-of-00002.safetensors",
179
+ "_decoder.wav_decoder.model.3.block.4.block.3.bias": "model-00002-of-00002.safetensors",
180
+ "_decoder.wav_decoder.model.3.block.4.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
181
+ "_decoder.wav_decoder.model.3.block.4.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
182
+ "_decoder.wav_decoder.model.4.block.0.alpha": "model-00002-of-00002.safetensors",
183
+ "_decoder.wav_decoder.model.4.block.1.bias": "model-00002-of-00002.safetensors",
184
+ "_decoder.wav_decoder.model.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
185
+ "_decoder.wav_decoder.model.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
186
+ "_decoder.wav_decoder.model.4.block.2.block.0.alpha": "model-00002-of-00002.safetensors",
187
+ "_decoder.wav_decoder.model.4.block.2.block.1.bias": "model-00002-of-00002.safetensors",
188
+ "_decoder.wav_decoder.model.4.block.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
189
+ "_decoder.wav_decoder.model.4.block.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
190
+ "_decoder.wav_decoder.model.4.block.2.block.2.alpha": "model-00002-of-00002.safetensors",
191
+ "_decoder.wav_decoder.model.4.block.2.block.3.bias": "model-00002-of-00002.safetensors",
192
+ "_decoder.wav_decoder.model.4.block.2.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
193
+ "_decoder.wav_decoder.model.4.block.2.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
194
+ "_decoder.wav_decoder.model.4.block.3.block.0.alpha": "model-00002-of-00002.safetensors",
195
+ "_decoder.wav_decoder.model.4.block.3.block.1.bias": "model-00002-of-00002.safetensors",
196
+ "_decoder.wav_decoder.model.4.block.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
197
+ "_decoder.wav_decoder.model.4.block.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
198
+ "_decoder.wav_decoder.model.4.block.3.block.2.alpha": "model-00002-of-00002.safetensors",
199
+ "_decoder.wav_decoder.model.4.block.3.block.3.bias": "model-00002-of-00002.safetensors",
200
+ "_decoder.wav_decoder.model.4.block.3.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
201
+ "_decoder.wav_decoder.model.4.block.3.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
202
+ "_decoder.wav_decoder.model.4.block.4.block.0.alpha": "model-00002-of-00002.safetensors",
203
+ "_decoder.wav_decoder.model.4.block.4.block.1.bias": "model-00002-of-00002.safetensors",
204
+ "_decoder.wav_decoder.model.4.block.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
205
+ "_decoder.wav_decoder.model.4.block.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
206
+ "_decoder.wav_decoder.model.4.block.4.block.2.alpha": "model-00002-of-00002.safetensors",
207
+ "_decoder.wav_decoder.model.4.block.4.block.3.bias": "model-00002-of-00002.safetensors",
208
+ "_decoder.wav_decoder.model.4.block.4.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
209
+ "_decoder.wav_decoder.model.4.block.4.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
210
+ "_decoder.wav_decoder.model.5.alpha": "model-00002-of-00002.safetensors",
211
+ "_decoder.wav_decoder.model.6.bias": "model-00002-of-00002.safetensors",
212
+ "_decoder.wav_decoder.model.6.parametrizations.weight.original0": "model-00002-of-00002.safetensors",
213
+ "_decoder.wav_decoder.model.6.parametrizations.weight.original1": "model-00002-of-00002.safetensors",
214
+ "acoustic_mask_emb.weight": "model-00002-of-00002.safetensors",
215
+ "acoustic_proj.bias": "model-00002-of-00002.safetensors",
216
+ "acoustic_proj.weight": "model-00002-of-00002.safetensors",
217
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
218
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
220
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
221
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
222
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
223
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
224
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
225
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
226
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
227
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
228
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
229
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
230
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
231
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
232
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
233
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
234
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
235
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
236
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
237
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
238
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
239
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
240
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
241
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
242
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
243
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
244
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
245
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
246
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
247
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
248
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
249
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
250
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
251
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
252
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
253
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
254
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
255
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
256
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
257
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
258
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
259
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
260
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
261
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
262
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
263
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
264
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
265
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
268
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
270
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
273
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
274
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
275
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
276
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
277
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
278
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
279
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
280
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
281
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
282
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
283
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
284
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
285
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
286
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
287
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
288
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
289
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
290
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
292
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
293
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
294
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
295
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
296
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
297
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
298
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
299
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
300
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
301
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
302
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
303
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
304
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
305
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
306
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
307
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
308
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
309
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
310
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
311
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
312
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
313
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
314
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
315
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
316
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
317
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
318
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
319
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
320
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
321
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
322
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
323
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
324
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
325
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
326
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
327
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
328
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
329
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
330
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
331
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
332
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
333
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
334
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
335
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
338
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
340
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
342
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
344
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
345
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
346
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
347
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
348
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
349
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
350
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
351
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
352
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
353
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
354
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
355
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
356
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
357
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
358
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
359
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
360
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
361
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
362
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
363
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
364
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
365
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
366
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
367
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
368
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
369
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
370
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
371
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
372
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
373
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
374
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
375
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
376
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
377
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
378
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
379
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
380
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
381
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
382
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
383
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
384
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
385
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
386
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
387
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
388
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
389
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
390
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
391
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
392
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
393
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
394
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
395
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
396
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
397
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
398
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
399
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
400
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
401
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
402
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
403
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
404
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
405
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
406
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
407
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
408
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
409
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
410
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
411
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
412
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
413
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
414
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
415
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
416
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
417
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
418
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
419
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
420
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
421
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
422
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
423
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
424
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
425
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
426
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
427
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
428
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
429
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
430
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
431
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
432
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
433
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
434
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
435
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
436
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
437
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
438
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
439
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
440
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
441
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
442
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
443
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
444
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
445
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
446
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
447
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
448
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
449
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
450
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
451
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
452
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
453
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
454
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
455
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
456
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
457
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
458
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
459
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
460
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
461
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
462
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
463
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
464
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
465
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
466
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
467
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
468
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
469
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
470
+ "model.norm.weight": "model-00002-of-00002.safetensors",
471
+ "prediction_head.cond_proj.weight": "model-00002-of-00002.safetensors",
472
+ "prediction_head.final_layer.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors",
473
+ "prediction_head.final_layer.linear.weight": "model-00002-of-00002.safetensors",
474
+ "prediction_head.layers.0.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors",
475
+ "prediction_head.layers.0.ffn.down_proj.weight": "model-00002-of-00002.safetensors",
476
+ "prediction_head.layers.0.ffn.gate_proj.weight": "model-00002-of-00002.safetensors",
477
+ "prediction_head.layers.0.ffn.up_proj.weight": "model-00002-of-00002.safetensors",
478
+ "prediction_head.layers.0.norm.weight": "model-00002-of-00002.safetensors",
479
+ "prediction_head.layers.1.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors",
480
+ "prediction_head.layers.1.ffn.down_proj.weight": "model-00002-of-00002.safetensors",
481
+ "prediction_head.layers.1.ffn.gate_proj.weight": "model-00002-of-00002.safetensors",
482
+ "prediction_head.layers.1.ffn.up_proj.weight": "model-00002-of-00002.safetensors",
483
+ "prediction_head.layers.1.norm.weight": "model-00002-of-00002.safetensors",
484
+ "prediction_head.layers.2.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors",
485
+ "prediction_head.layers.2.ffn.down_proj.weight": "model-00002-of-00002.safetensors",
486
+ "prediction_head.layers.2.ffn.gate_proj.weight": "model-00002-of-00002.safetensors",
487
+ "prediction_head.layers.2.ffn.up_proj.weight": "model-00002-of-00002.safetensors",
488
+ "prediction_head.layers.2.norm.weight": "model-00002-of-00002.safetensors",
489
+ "prediction_head.layers.3.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors",
490
+ "prediction_head.layers.3.ffn.down_proj.weight": "model-00002-of-00002.safetensors",
491
+ "prediction_head.layers.3.ffn.gate_proj.weight": "model-00002-of-00002.safetensors",
492
+ "prediction_head.layers.3.ffn.up_proj.weight": "model-00002-of-00002.safetensors",
493
+ "prediction_head.layers.3.norm.weight": "model-00002-of-00002.safetensors",
494
+ "prediction_head.layers.4.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors",
495
+ "prediction_head.layers.4.ffn.down_proj.weight": "model-00002-of-00002.safetensors",
496
+ "prediction_head.layers.4.ffn.gate_proj.weight": "model-00002-of-00002.safetensors",
497
+ "prediction_head.layers.4.ffn.up_proj.weight": "model-00002-of-00002.safetensors",
498
+ "prediction_head.layers.4.norm.weight": "model-00002-of-00002.safetensors",
499
+ "prediction_head.layers.5.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors",
500
+ "prediction_head.layers.5.ffn.down_proj.weight": "model-00002-of-00002.safetensors",
501
+ "prediction_head.layers.5.ffn.gate_proj.weight": "model-00002-of-00002.safetensors",
502
+ "prediction_head.layers.5.ffn.up_proj.weight": "model-00002-of-00002.safetensors",
503
+ "prediction_head.layers.5.norm.weight": "model-00002-of-00002.safetensors",
504
+ "prediction_head.noisy_images_proj.weight": "model-00002-of-00002.safetensors",
505
+ "prediction_head.t_embedder.mlp.0.weight": "model-00002-of-00002.safetensors",
506
+ "prediction_head.t_embedder.mlp.2.weight": "model-00002-of-00002.safetensors",
507
+ "time_end_embed.weight": "model-00002-of-00002.safetensors",
508
+ "time_start_embed.weight": "model-00002-of-00002.safetensors"
509
+ }
510
+ }
multi/tada-3b-ml/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/HumeAI/tada-3b-ml
multi/tada-codec/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
multi/tada-codec/README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ tags:
6
+ - tts
7
+ - text-to-speech
8
+ - speech-language-model
9
+ arxiv: 2602.23068
10
+ ---
11
+
12
+ <h1 align="center">TADA: A Generative Framework for Speech Modeling via Text-Acoustic Dual Alignment</h1>
13
+
14
+ <p align="center">
15
+ <a href="https://arxiv.org/abs/2602.23068"><img src="https://img.shields.io/badge/arXiv-Paper-b31b1b.svg" alt="Paper"></a>
16
+ <a href="https://huggingface.co/spaces/fffiloni/tada-dual-alignment-tts-demo"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-pink" alt="Demo"></a>
17
+ <a href="https://huggingface.co/spaces/HumeAI/tada"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-blue" alt="Demo"></a>
18
+ <a href="https://huggingface.co/collections/HumeAI/tada-684390a52ed08a7717b5e702"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Collection-yellow" alt="Collection"></a>
19
+ <a href="https://pypi.org/project/hume-tada/"><img src="https://img.shields.io/badge/PyPI-hume--tada-3775A9.svg?logo=pypi&logoColor=white" alt="PyPI"></a>
20
+ <a href="https://www.hume.ai/blog/opensource-tada"><img src="https://img.shields.io/badge/Blog-Post-orange.svg" alt="Blog"></a>
21
+ <a href="https://github.com/HumeAI/tada/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License"></a>
22
+ </p>
23
+
24
+ <img width="2400" height="1260" alt="image" src="https://github.com/user-attachments/assets/800eb8c5-eb6f-4e03-b8f3-150055a6cdfc" />
25
+
26
+ <p align="center"><br/><em>A unified speech-language model that synchronizes speech and text into a single, cohesive stream via 1:1 alignment.</em></p>
27
+
28
+ ---
29
+
30
+ # Text-Acoustic Dual-Alignment Large Language Model
31
+
32
+ TADA is a unified speech-language model that synchronizes speech and text into a single, cohesive stream via 1:1 alignment. By leveraging a novel tokenizer and architectural design, TADA achieves high-fidelity synthesis and generation with a fraction of the computational overhead required by traditional models.
33
+
34
+ ⭐️ arxiv: https://arxiv.org/abs/2602.23068 \
35
+ ⭐️ demo1: https://huggingface.co/spaces/fffiloni/tada-dual-alignment-tts-demo \
36
+ ⭐️ demo2: https://huggingface.co/spaces/HumeAI/tada \
37
+ ⭐️ github: https://github.com/HumeAI/tada \
38
+ ⭐️ blog post: https://www.hume.ai/blog/opensource-tada
39
+
40
+
multi/tada-codec/aligner-ar/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-ar/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17265871d0191d812e1170d47428f4c5f29bff0e09f1e0b14795a7c03fb3285b
3
+ size 893836552
multi/tada-codec/aligner-ch/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-ch/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69716fc909911653a332808e15a8d62ac9740b83a8e1e93d0e5b86d7052346c2
3
+ size 893836552
multi/tada-codec/aligner-de/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-de/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cc23a3135d471570fb1d09d1c24acbcbdb6a6e582820c3091d66cdb73e63262
3
+ size 893836552
multi/tada-codec/aligner-es/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-es/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e028340e2b74e35f6166b8823b34a7271e1e52b64b73eea9db7dfa95de483c7
3
+ size 893836552
multi/tada-codec/aligner-fr/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-fr/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:912b4eaedc453eebaa9ae38512dffa1f693fe3b1e3f7590fe8d094e15ef559f8
3
+ size 893836552
multi/tada-codec/aligner-it/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-it/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e39abbb1cbfb044db0d1be7cc444b555c7d911779a33e24780cc392c42909e4
3
+ size 893836552
multi/tada-codec/aligner-ja/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-ja/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7801c18c7feda94d83d7041c8f4a69c9c65417b6500cfe30da87d48677dc27e7
3
+ size 893836552
multi/tada-codec/aligner-pl/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-pl/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:118f62f1185512730ae8462f16e67b78362bd47c37e90b51b8183523993b2943
3
+ size 893836552
multi/tada-codec/aligner-pt/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner-pt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7f5eefcaeb778ba39eb020e02f8caad9dd903b96cd8b676f47e11d5fad433e6
3
+ size 893836552
multi/tada-codec/aligner/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Aligner"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/aligner/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d58d2193c99bbbb4b6195dacf3b0682c0934df4a8a96b9498ccadef976cdfd4
3
+ size 893836552
multi/tada-codec/decoder/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Decoder"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/decoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33d1e4780f155df5a03836c7d1f55b38a1f8c6e6f0cafbc99b4ef24e01271b42
3
+ size 652676628
multi/tada-codec/encoder/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Encoder"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "transformers_version": "4.57.3"
7
+ }
multi/tada-codec/encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4654059c056b8c05fcf136ce55c911ac4a3fe2bd106c8e021a4643338999c47e
3
+ size 601866928