diff --git a/multi/tada-1b/.gitattributes b/multi/tada-1b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/multi/tada-1b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/multi/tada-1b/README.md b/multi/tada-1b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..30cec839747249be6d9906e7b80d7df50d296667 --- /dev/null +++ 
b/multi/tada-1b/README.md @@ -0,0 +1,154 @@ +--- +license: mit +language: +- en +tags: +- tts +- text-to-speech +- speech-language-model +arxiv: 2602.23068 +--- + +

TADA: A Generative Framework for Speech Modeling via Text-Acoustic Dual Alignment

+ +

+ Paper + Demo + Demo + Collection + PyPI + Blog + License +

+ +image + +


A unified speech-language model that synchronizes speech and text into a single, cohesive stream via 1:1 alignment.

+ +--- + +# Text-Acoustic Dual-Alignment Large Language Model + +TADA is a unified speech-language model that synchronizes speech and text into a single, cohesive stream via 1:1 alignment. By leveraging a novel tokenizer and architectural design, TADA achieves high-fidelity synthesis and generation with a fraction of the computational overhead required by traditional models. + +⭐️ arxiv: https://arxiv.org/abs/2602.23068 \ +⭐️ demo1: https://huggingface.co/spaces/fffiloni/tada-dual-alignment-tts-demo \ +⭐️ demo2: https://huggingface.co/spaces/HumeAI/tada \ +⭐️ github: https://github.com/HumeAI/tada \ +⭐️ blog post: https://www.hume.ai/blog/opensource-tada \ + +## Key Features + +- 1:1 Token Alignment: Unlike standard models, TADA’s tokenizer encodes audio into a sequence of vectors that perfectly matches the number of text tokens. +- Dynamic Duration Synthesis: As a TTS model, it generates the full speech segment for a text token in a single autoregressive step, regardless of length. This eliminates the need for fixed-frame-rate processing. +- Dual-Stream Generation: In speech-language modeling mode, it generates a text token and the speech for the preceding token simultaneously, maintaining the same context length and minimal overhead compared to text-only generation. +- Efficiency & Reliability: TADA delivers superior expressiveness and natural flow while significantly reducing the computational cost associated with fixed audio frame rates. + +## How It Works + +### The Tokenization Schema + +TADA unifies modalities by ensuring that for every word or subword token, there is exactly one corresponding speech vector. This synchronized stream allows the model to "understand" the precise timing of speech relative to text. + +### Dynamic Autoregression + +Most TTS models require a fixed number of steps to produce one second of audio (e.g., 50 frames per second). TADA breaks this constraint: + +- Each autoregressive step covers one text token. 
+- The model dynamically determines the duration and prosody for that specific token. +- This results in a more natural flow and eliminates transcript hallucination. + +## Installation + +From the GitHub repo + +```bash +pip install git+https://github.com/HumeAI/tada.git +``` + +From source + +```bash +pip install -e . +``` + +## Models + +We provide several model checkpoints: + +| Model | Base Model | HuggingFace Hub | +| ------- | ------------ | --------------------------------------------------------- | +| TADA-1B | Llama 3.2 1B | [`HumeAI/tada-1b`](https://huggingface.co/HumeAI/tada-1b) | +| TADA-3B-ml | Llama 3.2 3B | [`HumeAI/tada-3b-ml`](https://huggingface.co/HumeAI/tada-3b-ml) | + +All models use the same encoder ([`HumeAI/tada-codec`](https://huggingface.co/HumeAI/tada-codec)) and can be loaded using the same API. + +## Evaluation + + + + + + + + + + + +
CER | Speed
Speaker Similarity | MOS
+ +## Run Inference + +### Text-to-speech + +```python +import torch +import torchaudio + +from tada.modules.encoder import Encoder +from tada.modules.tada import TadaForCausalLM + +device = "cuda" +encoder = Encoder.from_pretrained("HumeAI/tada-codec", subfolder="encoder").to(device) +model = TadaForCausalLM.from_pretrained("HumeAI/tada-1b").to(device) + +audio, sample_rate = torchaudio.load("samples/ljspeech.wav") +audio = audio.to(device) +prompt_text = "The examination and testimony of the experts, enabled the commission to conclude that five shots may have been fired." +prompt = encoder( + audio, text=[prompt_text], sample_rate=sample_rate +) + +output = model.generate( + prompt=prompt, + text="Please call Stella. Ask her to bring these things with her from the store.", +) +``` + +### Speech continuation + +Provide `num_extra_steps` if you want to generate text+speech continuation of the prompt + +```python +output = model.generate( + prompt=prompt, + num_extra_steps=50 +) +``` + +## 📚 Citation + +If you use this project in your research, please cite our paper: + +```bibtex +@article{dang2026tada, + title={TADA: A Generative Framework for Speech Modeling via Text-Acoustic Dual Alignment}, + author={Dang, Trung and Rao, Sharath and Gupta, Ananya and Gagne, Christopher and Tzirakis, Panagiotis and Baird, Alice and Cłapa, Jakub Piotr and Chin, Peter and Cowen, Alan}, + journal={arXiv preprint arXiv:2602.23068}, + year={2026} +} +``` + +## Contact + +Hume AI is an empathic AI research company. We research the datasets, tools, and models needed to give empathy to AI models to serve human wellbeing. 
If you're interested in any of our product or research collaborations, please reach out to us at hello@hume.ai \ No newline at end of file diff --git a/multi/tada-1b/config.json b/multi/tada-1b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbc9ff8c8b010ba43a910569f7e8f6fe35ae2643 --- /dev/null +++ b/multi/tada-1b/config.json @@ -0,0 +1,49 @@ +{ + "acoustic_dim": 512, + "acoustic_from_nth_hidden_state": -1, + "acoustic_mean": 0.0, + "acoustic_std": 1.5, + "add_semantic_to_condition": 0.0, + "architectures": [ + "TadaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "bottleneck_dim": null, + "context_window": 8, + "diffusion_head_type": "vibevoice", + "dist_type": "fixed", + "dtype": "bfloat16", + "eos_token_id": 128001, + "head_dim": 64, + "head_ffn_ratio": 4.0, + "head_layers": 6, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "latent_dropout": 0.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "num_time_classes": 256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "shift_acoustic": 5, + "tie_word_embeddings": true, + "transformers_version": "4.57.3", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/multi/tada-1b/generation_config.json b/multi/tada-1b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..02373ed88235b08a58139b96da788b79e24ccae8 --- /dev/null +++ b/multi/tada-1b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": 128001, + "transformers_version": "4.57.3" +} diff --git 
a/multi/tada-1b/graphics/CER.png b/multi/tada-1b/graphics/CER.png new file mode 100644 index 0000000000000000000000000000000000000000..a1eb093b85f5db9ee7c7816ae08a9ae4a270321a Binary files /dev/null and b/multi/tada-1b/graphics/CER.png differ diff --git a/multi/tada-1b/graphics/naturalness.png b/multi/tada-1b/graphics/naturalness.png new file mode 100644 index 0000000000000000000000000000000000000000..de261015e604f0d0fbe416a7232d81d868d6ea29 Binary files /dev/null and b/multi/tada-1b/graphics/naturalness.png differ diff --git a/multi/tada-1b/graphics/real-time.png b/multi/tada-1b/graphics/real-time.png new file mode 100644 index 0000000000000000000000000000000000000000..a7530bcec02f26826d33fc62cfad4ca55043dab7 Binary files /dev/null and b/multi/tada-1b/graphics/real-time.png differ diff --git a/multi/tada-1b/graphics/speaker-sim.png b/multi/tada-1b/graphics/speaker-sim.png new file mode 100644 index 0000000000000000000000000000000000000000..b66c52c7f8b2f146f762cfffcc50ac2d855a4735 Binary files /dev/null and b/multi/tada-1b/graphics/speaker-sim.png differ diff --git a/multi/tada-1b/issues.txt b/multi/tada-1b/issues.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ba659ad0b079d7a5c8695ef26e5da4ba48cc903 --- /dev/null +++ b/multi/tada-1b/issues.txt @@ -0,0 +1,14 @@ +------------------------------------------------------------- +#2 Installation Video and Testing - Step by Step +------------------------------------------------------------- + +[fahdmirzac] 12 Mar 2026 + +Hi, +Kudos on producing such a sublime model. 
I did a local installation and testing video : + +TADA: This Free Speech Model Just Broke the Rules of TTS - Local Demo +https://www.youtube.com/watch?v=DgX43zSjnB0 + +Thanks and regards, +Fahd \ No newline at end of file diff --git a/multi/tada-1b/model.safetensors b/multi/tada-1b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc09ddcafc4d44a81c7501e22c8b3406efab31a --- /dev/null +++ b/multi/tada-1b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32063b83702a6a9f7527c1541b4e9adb24433f0b3563f4bd8345aebbb8282c39 +size 3922463244 diff --git a/multi/tada-1b/source.txt b/multi/tada-1b/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..962b410ace44469bf715a5c4f84bec5515d3efdf --- /dev/null +++ b/multi/tada-1b/source.txt @@ -0,0 +1 @@ +https://huggingface.co/HumeAI/tada-1b \ No newline at end of file diff --git a/multi/tada-3b-ml/.gitattributes b/multi/tada-3b-ml/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/multi/tada-3b-ml/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs 
merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/multi/tada-3b-ml/config.json b/multi/tada-3b-ml/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ba194dbb0497bfc308269d1b38698c0c08782856 --- /dev/null +++ b/multi/tada-3b-ml/config.json @@ -0,0 +1,53 @@ +{ + "acoustic_dim": 512, + "acoustic_from_nth_hidden_state": -1, + "acoustic_mean": 0.0, + "acoustic_std": 1.5, + "add_semantic_to_condition": 0.0, + "architectures": [ + "TadaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "bottleneck_dim": null, + "context_window": 8, + "diffusion_head_type": "vibevoice", + "dist_type": "fixed", + "dtype": "bfloat16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "head_ffn_ratio": 4.0, + "head_layers": 6, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "latent_dropout": 0.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_time_classes": 256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + 
"original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "shift_acoustic": 5, + "tie_word_embeddings": true, + "transformers_version": "4.57.3", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/multi/tada-3b-ml/final-graphics-polished/.DS_Store b/multi/tada-3b-ml/final-graphics-polished/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..142fda72741bd31cd5448bee0a33083e0717db33 Binary files /dev/null and b/multi/tada-3b-ml/final-graphics-polished/.DS_Store differ diff --git a/multi/tada-3b-ml/final-graphics-polished/CER.png b/multi/tada-3b-ml/final-graphics-polished/CER.png new file mode 100644 index 0000000000000000000000000000000000000000..a1eb093b85f5db9ee7c7816ae08a9ae4a270321a Binary files /dev/null and b/multi/tada-3b-ml/final-graphics-polished/CER.png differ diff --git a/multi/tada-3b-ml/final-graphics-polished/MOS.png b/multi/tada-3b-ml/final-graphics-polished/MOS.png new file mode 100644 index 0000000000000000000000000000000000000000..b66c52c7f8b2f146f762cfffcc50ac2d855a4735 Binary files /dev/null and b/multi/tada-3b-ml/final-graphics-polished/MOS.png differ diff --git a/multi/tada-3b-ml/final-graphics-polished/naturalness.png b/multi/tada-3b-ml/final-graphics-polished/naturalness.png new file mode 100644 index 0000000000000000000000000000000000000000..de261015e604f0d0fbe416a7232d81d868d6ea29 Binary files /dev/null and b/multi/tada-3b-ml/final-graphics-polished/naturalness.png differ diff --git a/multi/tada-3b-ml/final-graphics-polished/real-time.png b/multi/tada-3b-ml/final-graphics-polished/real-time.png new file mode 100644 index 0000000000000000000000000000000000000000..a7530bcec02f26826d33fc62cfad4ca55043dab7 Binary files /dev/null and b/multi/tada-3b-ml/final-graphics-polished/real-time.png differ diff --git a/multi/tada-3b-ml/generation_config.json b/multi/tada-3b-ml/generation_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a6d8ec1f7155d412d60b6f2b8efbb922b95f99ff --- /dev/null +++ b/multi/tada-3b-ml/generation_config.json @@ -0,0 +1,10 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "transformers_version": "4.57.3" +} diff --git a/multi/tada-3b-ml/languages.txt b/multi/tada-3b-ml/languages.txt new file mode 100644 index 0000000000000000000000000000000000000000..4314124d5858b4636fd8f7ab7e6940ed09fb4d3c --- /dev/null +++ b/multi/tada-3b-ml/languages.txt @@ -0,0 +1,10 @@ +English +Japanese +German +French +Spanish +Chamorro +Arabic +Italian +Polish +Portuguese \ No newline at end of file diff --git a/multi/tada-3b-ml/model-00001-of-00002.safetensors b/multi/tada-3b-ml/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..01fd1aa6acd3a13718a0a12cb003b1d14de9e1b5 --- /dev/null +++ b/multi/tada-3b-ml/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:639a29ca27fdfe5c0a5bb18d24b47a72a049310ccd6c2ff4ca768640e1766470 +size 4965799096 diff --git a/multi/tada-3b-ml/model-00002-of-00002.safetensors b/multi/tada-3b-ml/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fd76bad48d507e7ac73728d1479de5290f8357a8 --- /dev/null +++ b/multi/tada-3b-ml/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52b2b31e1541d1703a562b48fc5872505518d264e9bcc829352cefac9ec284b4 +size 3901011596 diff --git a/multi/tada-3b-ml/model.safetensors.index.json b/multi/tada-3b-ml/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..eea702a8da3b3df0493541531e95d2bbdd2a8b87 --- /dev/null +++ b/multi/tada-3b-ml/model.safetensors.index.json @@ -0,0 +1,510 @@ +{ + "metadata": { + "total_parameters": 4225756258, + "total_size": 8866748612 + }, + "weight_map": { + 
"_decoder.decoder_proj.bias": "model-00002-of-00002.safetensors", + "_decoder.decoder_proj.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.final_norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.final_norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.ffn.0.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.ffn.0.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.ffn.3.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.ffn.3.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.self_attn._precomputed_mask": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.self_attn.qkv.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.self_attn.qkv.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.0.self_attn.rope_freqs": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.ffn.0.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.ffn.0.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.ffn.3.bias": 
"model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.ffn.3.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.self_attn._precomputed_mask": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.self_attn.qkv.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.self_attn.qkv.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.1.self_attn.rope_freqs": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.ffn.0.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.ffn.0.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.ffn.3.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.ffn.3.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.self_attn._precomputed_mask": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors", + 
"_decoder.local_attention_decoder.layers.2.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.self_attn.qkv.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.self_attn.qkv.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.2.self_attn.rope_freqs": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.ffn.0.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.ffn.0.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.ffn.3.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.ffn.3.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.self_attn._precomputed_mask": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.self_attn.qkv.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.3.self_attn.qkv.weight": "model-00002-of-00002.safetensors", + 
"_decoder.local_attention_decoder.layers.3.self_attn.rope_freqs": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.ffn.0.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.ffn.0.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.ffn.3.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.ffn.3.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.self_attn._precomputed_mask": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.self_attn.qkv.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.self_attn.qkv.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.4.self_attn.rope_freqs": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.ffn.0.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.ffn.0.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.ffn.3.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.ffn.3.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.norm.bias": "model-00002-of-00002.safetensors", 
+ "_decoder.local_attention_decoder.layers.5.norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.self_attn._precomputed_mask": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.self_attn.layer_norm.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.self_attn.layer_norm.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.self_attn.qkv.bias": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.self_attn.qkv.weight": "model-00002-of-00002.safetensors", + "_decoder.local_attention_decoder.layers.5.self_attn.rope_freqs": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.0.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.0.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.0.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.2.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.2.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + 
"_decoder.wav_decoder.model.1.block.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.2.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.2.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.2.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.2.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.3.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.3.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.3.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.3.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.3.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.3.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.4.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.4.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.4.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.4.block.3.bias": "model-00002-of-00002.safetensors", + 
"_decoder.wav_decoder.model.1.block.4.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.1.block.4.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.2.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.2.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.2.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.2.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.2.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.2.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.3.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.3.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.3.block.2.alpha": "model-00002-of-00002.safetensors", + 
"_decoder.wav_decoder.model.2.block.3.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.3.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.3.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.4.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.4.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.4.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.4.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.4.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.2.block.4.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.2.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.2.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + 
"_decoder.wav_decoder.model.3.block.2.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.2.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.2.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.2.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.3.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.3.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.3.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.3.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.3.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.3.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.4.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.4.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.4.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.4.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.3.block.4.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + 
"_decoder.wav_decoder.model.3.block.4.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.2.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.2.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.2.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.2.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.2.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.2.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.2.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.2.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.3.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.3.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.3.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.3.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.3.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.3.block.3.bias": "model-00002-of-00002.safetensors", + 
"_decoder.wav_decoder.model.4.block.3.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.3.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.4.block.0.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.4.block.1.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.4.block.1.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.4.block.1.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.4.block.2.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.4.block.3.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.4.block.3.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.4.block.4.block.3.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.5.alpha": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.6.bias": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.6.parametrizations.weight.original0": "model-00002-of-00002.safetensors", + "_decoder.wav_decoder.model.6.parametrizations.weight.original1": "model-00002-of-00002.safetensors", + "acoustic_mask_emb.weight": "model-00002-of-00002.safetensors", + "acoustic_proj.bias": "model-00002-of-00002.safetensors", + "acoustic_proj.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "prediction_head.cond_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.final_layer.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors", + "prediction_head.final_layer.linear.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.0.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.0.ffn.down_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.0.ffn.gate_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.0.ffn.up_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.0.norm.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.1.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.1.ffn.down_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.1.ffn.gate_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.1.ffn.up_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.1.norm.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.2.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.2.ffn.down_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.2.ffn.gate_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.2.ffn.up_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.2.norm.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.3.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.3.ffn.down_proj.weight": "model-00002-of-00002.safetensors", + 
"prediction_head.layers.3.ffn.gate_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.3.ffn.up_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.3.norm.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.4.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.4.ffn.down_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.4.ffn.gate_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.4.ffn.up_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.4.norm.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.5.adaLN_modulation.1.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.5.ffn.down_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.5.ffn.gate_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.5.ffn.up_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.layers.5.norm.weight": "model-00002-of-00002.safetensors", + "prediction_head.noisy_images_proj.weight": "model-00002-of-00002.safetensors", + "prediction_head.t_embedder.mlp.0.weight": "model-00002-of-00002.safetensors", + "prediction_head.t_embedder.mlp.2.weight": "model-00002-of-00002.safetensors", + "time_end_embed.weight": "model-00002-of-00002.safetensors", + "time_start_embed.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/multi/tada-3b-ml/source.txt b/multi/tada-3b-ml/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..547796471576f34d394c26f39d948337e5bddce0 --- /dev/null +++ b/multi/tada-3b-ml/source.txt @@ -0,0 +1 @@ +https://huggingface.co/HumeAI/tada-3b-ml \ No newline at end of file diff --git a/multi/tada-codec/.gitattributes b/multi/tada-codec/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..dab9a4e17afd2ef39d90ccb0b40ef2786fe77422 --- /dev/null 
+++ b/multi/tada-codec/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/multi/tada-codec/README.md b/multi/tada-codec/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e803dad5972ee7bf5e04543e2259e4265b5100ec --- /dev/null +++ b/multi/tada-codec/README.md @@ -0,0 +1,40 @@ +--- +license: mit +language: + - en +tags: + - tts + - text-to-speech + - speech-language-model +arxiv: 2602.23068 +--- + +

TADA: A Generative Framework for Speech Modeling via Text-Acoustic Dual Alignment

+ +

+ Paper + Demo + Demo + Collection + PyPI + Blog + License +

+ +image + +


A unified speech-language model that synchronizes speech and text into a single, cohesive stream via 1:1 alignment.

+ +--- + +# Text-Acoustic Dual-Alignment Large Language Model + +TADA is a unified speech-language model that synchronizes speech and text into a single, cohesive stream via 1:1 alignment. By leveraging a novel tokenizer and architectural design, TADA achieves high-fidelity synthesis and generation with a fraction of the computational overhead required by traditional models. + +⭐️ arxiv: https://arxiv.org/abs/2602.23068 \ +⭐️ demo1: https://huggingface.co/spaces/fffiloni/tada-dual-alignment-tts-demo \ +⭐️ demo2: https://huggingface.co/spaces/HumeAI/tada \ +⭐️ github: https://github.com/HumeAI/tada \ +⭐️ blog post: https://www.hume.ai/blog/opensource-tada + + diff --git a/multi/tada-codec/aligner-ar/config.json b/multi/tada-codec/aligner-ar/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-ar/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner-ar/model.safetensors b/multi/tada-codec/aligner-ar/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c88e4ccf4aea450c9692bf5fcaa99adfa12c97bc --- /dev/null +++ b/multi/tada-codec/aligner-ar/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17265871d0191d812e1170d47428f4c5f29bff0e09f1e0b14795a7c03fb3285b +size 893836552 diff --git a/multi/tada-codec/aligner-ch/config.json b/multi/tada-codec/aligner-ch/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-ch/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner-ch/model.safetensors b/multi/tada-codec/aligner-ch/model.safetensors new file mode 100644 
index 0000000000000000000000000000000000000000..1fa4cc94a0e870c21427f3c1864392187b986656 --- /dev/null +++ b/multi/tada-codec/aligner-ch/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69716fc909911653a332808e15a8d62ac9740b83a8e1e93d0e5b86d7052346c2 +size 893836552 diff --git a/multi/tada-codec/aligner-de/config.json b/multi/tada-codec/aligner-de/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-de/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner-de/model.safetensors b/multi/tada-codec/aligner-de/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3bb2d39ecc2171b5d30b3888dc4b5ad3906b9405 --- /dev/null +++ b/multi/tada-codec/aligner-de/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc23a3135d471570fb1d09d1c24acbcbdb6a6e582820c3091d66cdb73e63262 +size 893836552 diff --git a/multi/tada-codec/aligner-es/config.json b/multi/tada-codec/aligner-es/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-es/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner-es/model.safetensors b/multi/tada-codec/aligner-es/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..39c2490ce702cacb67e3f15a4ea3d5804f6e384f --- /dev/null +++ b/multi/tada-codec/aligner-es/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e028340e2b74e35f6166b8823b34a7271e1e52b64b73eea9db7dfa95de483c7 +size 893836552 diff --git a/multi/tada-codec/aligner-fr/config.json 
b/multi/tada-codec/aligner-fr/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-fr/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner-fr/model.safetensors b/multi/tada-codec/aligner-fr/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f71a2710dfdf48c1b052283c592e30c3a2773d63 --- /dev/null +++ b/multi/tada-codec/aligner-fr/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:912b4eaedc453eebaa9ae38512dffa1f693fe3b1e3f7590fe8d094e15ef559f8 +size 893836552 diff --git a/multi/tada-codec/aligner-it/config.json b/multi/tada-codec/aligner-it/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-it/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner-it/model.safetensors b/multi/tada-codec/aligner-it/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c1949221242df0e63397cf1f42a03dcdc245445b --- /dev/null +++ b/multi/tada-codec/aligner-it/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e39abbb1cbfb044db0d1be7cc444b555c7d911779a33e24780cc392c42909e4 +size 893836552 diff --git a/multi/tada-codec/aligner-ja/config.json b/multi/tada-codec/aligner-ja/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-ja/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git 
a/multi/tada-codec/aligner-ja/model.safetensors b/multi/tada-codec/aligner-ja/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d851475529497c8398cbc314a8b5c56c4b825b68 --- /dev/null +++ b/multi/tada-codec/aligner-ja/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7801c18c7feda94d83d7041c8f4a69c9c65417b6500cfe30da87d48677dc27e7 +size 893836552 diff --git a/multi/tada-codec/aligner-pl/config.json b/multi/tada-codec/aligner-pl/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-pl/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner-pl/model.safetensors b/multi/tada-codec/aligner-pl/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..17b343f48ecb2058a8b01ce5e192aebc617a007e --- /dev/null +++ b/multi/tada-codec/aligner-pl/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:118f62f1185512730ae8462f16e67b78362bd47c37e90b51b8183523993b2943 +size 893836552 diff --git a/multi/tada-codec/aligner-pt/config.json b/multi/tada-codec/aligner-pt/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner-pt/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner-pt/model.safetensors b/multi/tada-codec/aligner-pt/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7e181833e778ed91cd3b9e7bd3a229f18a8fe6d1 --- /dev/null +++ b/multi/tada-codec/aligner-pt/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d7f5eefcaeb778ba39eb020e02f8caad9dd903b96cd8b676f47e11d5fad433e6 +size 893836552 diff --git a/multi/tada-codec/aligner/config.json b/multi/tada-codec/aligner/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e257c2e42e19fbf83956c0db3574859639b57e5 --- /dev/null +++ b/multi/tada-codec/aligner/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Aligner" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/aligner/model.safetensors b/multi/tada-codec/aligner/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f6f871d43914062511809accc3ca855cfc0a7c8e --- /dev/null +++ b/multi/tada-codec/aligner/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d58d2193c99bbbb4b6195dacf3b0682c0934df4a8a96b9498ccadef976cdfd4 +size 893836552 diff --git a/multi/tada-codec/decoder/config.json b/multi/tada-codec/decoder/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0007e9b2536d5c1cbfd1446ede8796e2d3d23550 --- /dev/null +++ b/multi/tada-codec/decoder/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Decoder" + ], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/decoder/model.safetensors b/multi/tada-codec/decoder/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f9459820f2b35763c4dad80d649c734a76519b3 --- /dev/null +++ b/multi/tada-codec/decoder/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33d1e4780f155df5a03836c7d1f55b38a1f8c6e6f0cafbc99b4ef24e01271b42 +size 652676628 diff --git a/multi/tada-codec/encoder/config.json b/multi/tada-codec/encoder/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d2eba255c1e89579f44642866c143c47c8825806 --- /dev/null +++ b/multi/tada-codec/encoder/config.json @@ -0,0 +1,7 @@ +{ + "architectures": [ + "Encoder" + 
], + "dtype": "bfloat16", + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/encoder/model.safetensors b/multi/tada-codec/encoder/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..612e5ceaadb50571e593fcb7dbe60797e3aa74b2 --- /dev/null +++ b/multi/tada-codec/encoder/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4654059c056b8c05fcf136ce55c911ac4a3fe2bd106c8e021a4643338999c47e +size 601866928 diff --git a/multi/tada-codec/llama_decoder/config.json b/multi/tada-codec/llama_decoder/config.json new file mode 100644 index 0000000000000000000000000000000000000000..90323f35dee332ea8db0123038cb40c00acf1d24 --- /dev/null +++ b/multi/tada-codec/llama_decoder/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaDecoder" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "dtype": "bfloat16", + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_num_frames": 256, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 16, + "num_codebooks": 8, + "num_hidden_layers": 16, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.3", + "use_cache": true, + "vocab_size": 1024 +} diff --git a/multi/tada-codec/llama_decoder/generation_config.json b/multi/tada-codec/llama_decoder/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e063347b3fe4fdb170c2d11d9954a9f22df5756b --- /dev/null +++ b/multi/tada-codec/llama_decoder/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/llama_decoder/model.safetensors 
b/multi/tada-codec/llama_decoder/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2f4881d2721a09cb52ae7a8ff3afef76d1347c01 --- /dev/null +++ b/multi/tada-codec/llama_decoder/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2610646ae4afbd6485699cbf021fa23a10c542fceb3afe05487e67e5a2636d9e +size 527047360 diff --git a/multi/tada-codec/source.txt b/multi/tada-codec/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdf04d2aae1f3fbb5732d8678c617669fe63bb2b --- /dev/null +++ b/multi/tada-codec/source.txt @@ -0,0 +1 @@ +https://huggingface.co/HumeAI/tada-codec \ No newline at end of file diff --git a/multi/tada-codec/spkr-verf/config.json b/multi/tada-codec/spkr-verf/config.json new file mode 100644 index 0000000000000000000000000000000000000000..adec17f1e98f130e4dd87dd18da09e5fc4db7deb --- /dev/null +++ b/multi/tada-codec/spkr-verf/config.json @@ -0,0 +1,13 @@ +{ + "architectures": [ + "AcousticSpkrVerfModel" + ], + "dropout": 0.1, + "dtype": "float32", + "embed_dim": 192, + "hidden_dim": 768, + "input_dim": 512, + "model_type": "acoustic_spkr_verf", + "num_layers": 3, + "transformers_version": "4.57.3" +} diff --git a/multi/tada-codec/spkr-verf/model.safetensors b/multi/tada-codec/spkr-verf/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e77b8525d93f0f085eee7a538c27067e91a9cb69 --- /dev/null +++ b/multi/tada-codec/spkr-verf/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85d56f80a2ba2eda6f71e9e1f092769cb49f85f9e63c0a6d4e3edafd8291290c +size 4542000