niobures committed on
Commit
a42d6c3
·
verified ·
1 Parent(s): d0583ca

OuteTTS (code, models, paper)

Browse files
Files changed (40) hide show
  1. .gitattributes +7 -0
  2. ClonEval. An Open Voice Cloning Benchmark.pdf +3 -0
  3. code/ComfyUI_OuteTTS.zip +3 -0
  4. code/OuteTTS.zip +3 -0
  5. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/.gitattributes +37 -0
  6. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/README.md +294 -0
  7. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/genai_config.json +50 -0
  8. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/model.onnx +3 -0
  9. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/model.onnx.data +3 -0
  10. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/source.txt +1 -0
  11. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/special_tokens_map.json +0 -0
  12. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/tokenizer.json +3 -0
  13. models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/tokenizer_config.json +0 -0
  14. models/multi/Llama-OuteTTS-1.0-1B-ONNX/.gitattributes +38 -0
  15. models/multi/Llama-OuteTTS-1.0-1B-ONNX/README.md +34 -0
  16. models/multi/Llama-OuteTTS-1.0-1B-ONNX/config.json +46 -0
  17. models/multi/Llama-OuteTTS-1.0-1B-ONNX/generation_config.json +13 -0
  18. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model.onnx +3 -0
  19. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model.onnx_data +3 -0
  20. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_bnb4.onnx +3 -0
  21. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_fp16.onnx +3 -0
  22. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_fp16.onnx_data +3 -0
  23. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_int8.onnx +3 -0
  24. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_q4.onnx +3 -0
  25. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_q4f16.onnx +3 -0
  26. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_quantized.onnx +3 -0
  27. models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_uint8.onnx +3 -0
  28. models/multi/Llama-OuteTTS-1.0-1B-ONNX/source.txt +1 -0
  29. models/multi/Llama-OuteTTS-1.0-1B-ONNX/special_tokens_map.json +0 -0
  30. models/multi/Llama-OuteTTS-1.0-1B-ONNX/tokenizer.json +3 -0
  31. models/multi/Llama-OuteTTS-1.0-1B-ONNX/tokenizer_config.json +0 -0
  32. models/multi/Llama-OuteTTS-1.0-1B/.gitattributes +36 -0
  33. models/multi/Llama-OuteTTS-1.0-1B/README.md +247 -0
  34. models/multi/Llama-OuteTTS-1.0-1B/config.json +35 -0
  35. models/multi/Llama-OuteTTS-1.0-1B/generation_config.json +13 -0
  36. models/multi/Llama-OuteTTS-1.0-1B/model.safetensors +3 -0
  37. models/multi/Llama-OuteTTS-1.0-1B/source.txt +1 -0
  38. models/multi/Llama-OuteTTS-1.0-1B/special_tokens_map.json +0 -0
  39. models/multi/Llama-OuteTTS-1.0-1B/tokenizer.json +3 -0
  40. models/multi/Llama-OuteTTS-1.0-1B/tokenizer_config.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ ClonEval.[[:space:]]An[[:space:]]Open[[:space:]]Voice[[:space:]]Cloning[[:space:]]Benchmark.pdf filter=lfs diff=lfs merge=lfs -text
37
+ models/multi/Llama-OuteTTS-1.0-1B[[:space:]](Prince-1)/model.onnx.data filter=lfs diff=lfs merge=lfs -text
38
+ models/multi/Llama-OuteTTS-1.0-1B[[:space:]](Prince-1)/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
40
+ models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text
41
+ models/multi/Llama-OuteTTS-1.0-1B-ONNX/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ models/multi/Llama-OuteTTS-1.0-1B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
ClonEval. An Open Voice Cloning Benchmark.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6966c4de4a1178bcada8ab11dfe726396d3bcf030ba13fccf52b606faa1af8e
3
+ size 151200
code/ComfyUI_OuteTTS.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e4bbb33603d1f3f5e162032ee88f9cf4721e0398d70a3533a958a9dc8f011da
3
+ size 270140
code/OuteTTS.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a736b5ad652f7474337091e48d4ac7e3a5003153f391ac7d2869607b0009630
3
+ size 10047606
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ model.onnx.data filter=lfs diff=lfs merge=lfs -text
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/README.md ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-sa-4.0
3
+ language:
4
+ - en
5
+ - ar
6
+ - zh
7
+ - nl
8
+ - fr
9
+ - de
10
+ - it
11
+ - ja
12
+ - ko
13
+ - lt
14
+ - ru
15
+ - es
16
+ - pt
17
+ - be
18
+ - bn
19
+ - ka
20
+ - hu
21
+ - lv
22
+ - fa
23
+ - pl
24
+ - sw
25
+ - ta
26
+ - uk
27
+ tags:
28
+ - unsloth
29
+ - onnx
30
+ - onnxruntime-genai
31
+ - onnxruntime
32
+ - tts
33
+ pipeline_tag: text-to-speech
34
+ library_name: onnxruntime-genai
35
+ base_model:
36
+ - unsloth/Llama-OuteTTS-1.0-1B
37
+ base_model_relation: quantized
38
+ ---
39
+ <div>
40
+ <p style="margin-bottom: 0; margin-top: 0;">
41
+ <strong>See <a href="https://huggingface.co/collections/unsloth/text-to-speech-tts-models-68007ab12522e96be1e02155">our collection</a> for all our TTS model uploads.</strong>
42
+ </p>
43
+ <p style="margin-bottom: 0;">
44
+ <em>Learn to fine-tune TTS models - <a href="https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning">Read our Guide</a>.</em>
45
+ </p>
46
+ <p style="margin-top: 0;margin-bottom: 0;">
47
+ <em><a href="https://docs.unsloth.ai/basics/unsloth-dynamic-v2.0-gguf">Unsloth Dynamic 2.0</a> achieves superior accuracy & outperforms other leading quants.</em>
48
+ </p>
49
+ <div style="display: flex; gap: 5px; align-items: center; ">
50
+ <a href="https://github.com/unslothai/unsloth/">
51
+ <img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="133">
52
+ </a>
53
+ <a href="https://discord.gg/unsloth">
54
+ <img src="https://github.com/unslothai/unsloth/raw/main/images/Discord%20button.png" width="173">
55
+ </a>
56
+ <a href="https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning">
57
+ <img src="https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation%20green%20button.png" width="143">
58
+ </a>
59
+ </div>
60
+ <h1 style="margin-top: 0rem;">✨ Run & Fine-tune TTS models with Unsloth!</h1>
61
+ </div>
62
+
63
+ - Fine-tune TTS models for free using our Google [Colab notebooks here](https://docs.unsloth.ai/get-started/unsloth-notebooks#text-to-speech-tts-notebooks)!
64
+ - Read our Blog about TTS support: [unsloth.ai/blog/tts](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning)
65
+
66
+ | Unsloth supports | Free Notebooks | Performance | Memory use |
67
+ |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------|
68
+ | **Oute-TTS** | [▶️ Start on Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Oute_TTS_(1B).ipynb) | 1.5x faster | 58% less |
69
+ | **Whisper Large V3** | [▶️ Start on Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Whisper.ipynb) | 1.5x faster | 50% less |
70
+ | **Qwen3 (14B)** | [▶️ Start on Colab](https://docs.unsloth.ai/get-started/unsloth-notebooks) | 2x faster | 70% less |
71
+ | **Llama 3.2 Vision (11B)** | [▶️ Start on Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb) | 1.8x faster | 50% less |
72
+
73
+ <div class="p-4 bg-gray-50 dark:bg-gray-800 rounded-lg shadow-sm mb-12">
74
+ <div class="text-center mb-4">
75
+ <h2 class="text-xl font-light text-gray-900 dark:text-white tracking-tight mt-0 mb-0">Oute A I</h2>
76
+ <div class="flex justify-center gap-6 mt-4">
77
+ <a href="https://www.outeai.com/" target="_blank" class="flex items-center gap-1 text-gray-700 dark:text-gray-300 text-m font-medium hover:text-gray-900 dark:hover:text-white transition-colors underline">
78
+ <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
79
+ <circle cx="12" cy="12" r="10"></circle>
80
+ <path d="M2 12h20M12 2a15.3 15.3 0 0 1 4 10 15.3 15.3 0 0 1-4 10 15.3 15.3 0 0 1-4-10 15.3 15.3 0 0 1 4-10z"></path>
81
+ </svg>
82
+ outeai.com
83
+ </a>
84
+ <a href="https://discord.gg/vyBM87kAmf" target="_blank" class="flex items-center gap-1 text-gray-700 dark:text-gray-300 text-m font-medium hover:text-gray-900 dark:hover:text-white transition-colors underline">
85
+ <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
86
+ <path d="M21 11.5a8.38 8.38 0 0 1-.9 3.8 8.5 8.5 0 0 1-7.6 4.7 8.38 8.38 0 0 1-3.8-.9L3 21l1.9-5.7a8.38 8.38 0 0 1-.9-3.8 8.5 8.5 0 0 1 4.7-7.6 8.38 8.38 0 0 1 3.8-.9h.5a8.48 8.48 0 0 1 8 8v.5z"></path>
87
+ </svg>
88
+ Discord
89
+ </a>
90
+ <a href="https://x.com/OuteAI" target="_blank" class="flex items-center gap-1 text-gray-700 dark:text-gray-300 text-m font-medium hover:text-gray-900 dark:hover:text-white transition-colors underline">
91
+ <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
92
+ <path d="M23 3a10.9 10.9 0 0 1-3.14 1.53 4.48 4.48 0 0 0-7.86 3v1A10.66 10.66 0 0 1 3 4s-4 9 5 13a11.64 11.64 0 0 1-7 2c9 5 20 0 20-11.5a4.5 4.5 0 0 0-.08-.83A7.72 7.72 0 0 0 23 3z"></path>
93
+ </svg>
94
+ @OuteAI
95
+ </a>
96
+ </div>
97
+ </div>
98
+
99
+ <div class="grid grid-cols-3 sm:grid-cols-3 gap-2">
100
+ <a href="https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B" target="_blank" class="bg-white dark:bg-gray-700 text-gray-800 dark:text-gray-100 text-sm font-medium py-2 px-3 rounded-md text-center hover:bg-gray-100 dark:hover:bg-gray-600 hover:border-gray-300 dark:hover:border-gray-500 border border-transparent transition-all">
101
+ Llama OuteTTS 1.0 1B
102
+ </a>
103
+ <a href="https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B-GGUF" target="_blank" class="bg-white dark:bg-gray-700 text-gray-800 dark:text-gray-100 text-sm font-medium py-2 px-3 rounded-md text-center hover:bg-gray-100 dark:hover:bg-gray-600 hover:border-gray-300 dark:hover:border-gray-500 border border-transparent transition-all">
104
+ Llama OuteTTS 1.0 1B GGUF
105
+ </a>
106
+ <a href="https://github.com/edwko/OuteTTS" target="_blank" class="bg-white dark:bg-gray-700 text-gray-800 dark:text-gray-100 text-sm font-medium py-2 px-3 rounded-md text-center hover:bg-gray-100 dark:hover:bg-gray-600 hover:border-gray-300 dark:hover:border-gray-500 border border-transparent transition-all">
107
+ GitHub Library
108
+ </a>
109
+ </div>
110
+ </div>
111
+
112
+ > [!IMPORTANT]
113
+ > **Important Sampling Considerations**
114
+ >
115
+ > When using OuteTTS version 1.0, it is crucial to use the settings specified in the [Sampling Configuration](#sampling-configuration) section.
116
+ >
117
+ > The **repetition penalty implementation** is particularly important - this model requires penalization applied to a **64-token recent window**,
118
+ > rather than across the entire context window. Penalizing the entire context will cause the model to produce **broken or low-quality output**.
119
+ >
120
+ > Currently, **llama.cpp** delivers the most reliable and consistent output quality by default.
121
+ > Both **llama.cpp** and **EXL2** support this windowed sampling approach, while **Transformers** doesn't.
122
+ >
123
+ > To address this limitation, I've implemented a **windowed repetition penalty** for the **Hugging Face Transformers** backend in the **OuteTTS** library,
124
+ > which significantly improves output quality and resolves sampling issues, providing comparable results to llama.cpp.
125
+
126
+ # OuteTTS Version 1.0
127
+
128
+ This update brings significant improvements in speech synthesis and voice cloning—delivering a more powerful, accurate, and user-friendly experience in a compact size.
129
+
130
+ ## What's New
131
+
132
+ ### 1. Prompt Revamp & Dependency Removal
133
+ - **Automatic Word Alignment:** The model now performs word alignment internally. Simply input raw text—no pre-processing required—and the model handles the rest, streamlining your workflow. For optimal results, use normalized, readable text without newlines (light normalization is applied automatically in outetts library).
134
+ - **Native Multilingual Text Support:** Direct support for native text across multiple languages eliminates the need for romanization.
135
+ - **Enhanced Metadata Integration:** The updated prompt system incorporates additional metadata (time, energy, spectral centroid, pitch) at both global and word levels, improving speaker flow and synthesis quality.
136
+ - **Special Tokens for Audio Codebooks:** New tokens for c1 (codebook 1) and c2 (codebook 2).
137
+
138
+ ### 2. New Audio Encoder Model
139
+ - **DAC Encoder:** Integrates a DAC audio encoder from [ibm-research/DAC.speech.v1.0](https://huggingface.co/ibm-research/DAC.speech.v1.0), utilizing two codebooks for high quality audio reconstruction.
140
+ - **Performance Trade-off:** Improved audio fidelity increases the token generation rate from 75 to 150 tokens per second. This trade-off prioritizes quality, especially for multilingual applications.
141
+
142
+ ### 3. Voice Cloning
143
+ - **One-Shot Voice Cloning:** To achieve one-shot cloning, the model typically requires only around **10 seconds** of reference audio to produce an accurate voice representation.
144
+ - **Improved Accuracy:** Enhanced by the new encoder and additional training metadata, voice cloning is now more natural and precise.
145
+
146
+ ### 4. Auto Text Alignment & Numerical Support
147
+ - **Automatic Text Alignment:** Aligns raw text at the word level, even for languages without clear boundaries (e.g., Japanese, Chinese), using insights from pre-processed training data.
148
+ - **Direct Numerical Input:** Built-in multilingual numerical support allows direct use of numbers in prompts—no textual conversion needed. (The model typically chooses the dominant language present. Mixing languages in a single prompt may lead to mistakes.)
149
+
150
+ ### 5. Multilingual Capabilities
151
+
152
+ - **Supported Languages:** OuteTTS offers varying proficiency levels across languages, based on training data exposure.
153
+
154
+ - **High Training Data Languages:** These languages feature extensive training: **English, Arabic, Chinese, Dutch, French, German, Italian, Japanese, Korean, Lithuanian, Russian, Spanish**
155
+
156
+ - **Moderate Training Data Languages:** These languages received moderate training, offering good performance with occasional limitations: **Portuguese, Belarusian, Bengali, Georgian, Hungarian, Latvian, Persian/Farsi, Polish, Swahili, Tamil, Ukrainian**
157
+
158
+ - **Beyond Supported Languages:** The model can generate speech in untrained languages with varying success. Experiment with unlisted languages, though results may not be optimal.
159
+
160
+ ## Video Showcase
161
+
162
+ <video width="1280" height="720" controls style="box-shadow: 0px 0px 20px 10px rgba(0, 0, 0, 0.05), 0px 1px 3px 10px rgba(255, 255, 255, 0.05);">
163
+ <source src="https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B-GGUF/resolve/main/media/showcase.mp4" type="video/mp4">
164
+ Your browser does not support the video tag.
165
+ </video>
166
+
167
+ ## Quick Start Guide
168
+
169
+ Getting started with **OuteTTS** is simple:
170
+
171
+ ### Installation
172
+
173
+ 🔗 [Installation instructions](https://github.com/edwko/OuteTTS?tab=readme-ov-file#installation)
174
+
175
+ ### Basic Usage
176
+ ```python
177
+ import outetts
178
+
179
+ # Initialize the interface
180
+ interface = outetts.Interface(
181
+ config=outetts.ModelConfig.auto_config(
182
+ model=outetts.Models.VERSION_1_0_SIZE_1B,
183
+ # For llama.cpp backend
184
+ backend=outetts.Backend.LLAMACPP,
185
+ quantization=outetts.LlamaCppQuantization.FP16
186
+ # For transformers backend
187
+ # backend=outetts.Backend.HF,
188
+ )
189
+ )
190
+
191
+ # Load the default speaker profile
192
+ speaker = interface.load_default_speaker("EN-FEMALE-1-NEUTRAL")
193
+
194
+ # Or create your own speaker profiles in seconds and reuse them instantly
195
+ # speaker = interface.create_speaker("path/to/audio.wav")
196
+ # interface.save_speaker(speaker, "speaker.json")
197
+ # speaker = interface.load_speaker("speaker.json")
198
+
199
+ # Generate speech
200
+ output = interface.generate(
201
+ config=outetts.GenerationConfig(
202
+ text="Hello, how are you doing?",
203
+ generation_type=outetts.GenerationType.CHUNKED,
204
+ speaker=speaker,
205
+ sampler_config=outetts.SamplerConfig(
206
+ temperature=0.4
207
+ ),
208
+ )
209
+ )
210
+
211
+ # Save to file
212
+ output.save("output.wav")
213
+ ```
214
+
215
+ ### More Configuration Options
216
+ For advanced settings and customization, visit the official repository:
217
+ 🔗 [interface_usage.md](https://github.com/edwko/OuteTTS/blob/main/docs/interface_usage.md)
218
+
219
+ ## Usage Recommendations
220
+
221
+ ### Speaker Reference
222
+ The model is designed to be used with a speaker reference. Without one, it generates random vocal characteristics, often leading to lower-quality outputs.
223
+ The model inherits the referenced speaker's emotion, style, and accent.
224
+ When transcribing to other languages with the same speaker, you may observe the model retaining the original accent.
225
+
226
+ ### Multilingual Application
227
+ It is recommended to create a speaker profile in the language you intend to use. This helps achieve the best results in that specific language, including tone, accent, and linguistic features.
228
+
229
+ While the model supports cross-lingual speech, it still relies on the reference speaker. If the speaker has a distinct accent—such as British English—other languages may carry that accent as well.
230
+
231
+ ### Optimal Audio Length
232
+ - **Best Performance:** Generate audio around **42 seconds** in a single run (approximately 8,192 tokens). It is recomended not to near the limits of this windows when generating. Usually, the best results are up to 7,000 tokens.
233
+ - **Context Reduction with Speaker Reference:** If the speaker reference is 10 seconds long, the effective context is reduced to approximately 32 seconds.
234
+
235
+ ### Temperature Setting Recommendations
236
+ Testing shows that a temperature of **0.4** is an ideal starting point for accuracy (with the sampling settings below). However, some voice references may benefit from higher temperatures for enhanced expressiveness or slightly lower temperatures for more precise voice replication.
237
+
238
+ ### Verifying Speaker Encoding
239
+ If the cloned voice quality is subpar, check the encoded speaker sample.
240
+
241
+ ```python
242
+ interface.decode_and_save_speaker(speaker=your_speaker, path="speaker.wav")
243
+ ```
244
+
245
+ The DAC audio reconstruction model is lossy, and samples with clipping, excessive loudness, or unusual vocal features may introduce encoding issues that impact output quality.
246
+
247
+ ### Sampling Configuration
248
+ For optimal results with this TTS model, use the following sampling settings.
249
+
250
+ | Parameter | Value |
251
+ |-------------------|----------|
252
+ | Temperature | 0.4 |
253
+ | Repetition Penalty| 1.1 |
254
+ | **Repetition Range** | **64** |
255
+ | Top-k | 40 |
256
+ | Top-p | 0.9 |
257
+ | Min-p | 0.05 |
258
+
259
+ ## Model Specifications
260
+
261
+ - **Training Data:** Trained on **~60k hours of audio**
262
+ - **Context Length:** Supports a maximum context window of **8,192 tokens**
263
+
264
+ ### Training Parameters
265
+
266
+ #### **Pre-Training**
267
+ - **Optimizer:** AdamW
268
+ - **Batch Size:** 1 million tokens
269
+ - **Max Learning Rate:** 3e-4
270
+ - **Min Learning Rate:** 3e-5
271
+ - **Context Length:** 8192
272
+
273
+ #### **Fine-Tuning**
274
+ - **Optimizer:** AdamW
275
+ - **Max Learning Rate:** 1e-5
276
+ - **Min Learning Rate:** 5e-6
277
+ - **Data:** 10,000 diverse, high-quality examples
278
+
279
+ ## License Information
280
+
281
+ - **Initial Llama3.2 Components:** [Llama 3.2 Community License Agreement ](https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/LICENSE.txt)
282
+ - **Our Continued Pre-Training, Fine-Tuning, and Additional Components:** [CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/)
283
+
284
+ ## Acknowledgments
285
+
286
+ - Big thanks to **Hugging Face** for their continued resource support through their grant program!
287
+ - Audio encoding and decoding utilize [ibm-research/DAC.speech.v1.0](https://huggingface.co/ibm-research/DAC.speech.v1.0)
288
+ - OuteTTS is built with [Llama3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) as the base model, with continued pre-training and fine-tuning.
289
+
290
+ ### Ethical Use Guidelines
291
+ This text-to-speech model is intended for legitimate applications that enhance accessibility, creativity, and communication;
292
+ prohibited uses include impersonation without consent, creation of deliberately misleading content,
293
+ generation of harmful or harassing material, distribution of synthetic audio without proper disclosure,
294
+ voice cloning without permission, and any uses that violate applicable laws, regulations, or copyrights.
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/genai_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "bos_token_id": 133309,
4
+ "context_length": 131072,
5
+ "decoder": {
6
+ "session_options": {
7
+ "log_id": "onnxruntime-genai",
8
+ "provider_options": []
9
+ },
10
+ "filename": "model.onnx",
11
+ "head_size": 64,
12
+ "hidden_size": 2048,
13
+ "inputs": {
14
+ "input_ids": "input_ids",
15
+ "attention_mask": "attention_mask",
16
+ "position_ids": "position_ids",
17
+ "past_key_names": "past_key_values.%d.key",
18
+ "past_value_names": "past_key_values.%d.value"
19
+ },
20
+ "outputs": {
21
+ "logits": "logits",
22
+ "present_key_names": "present.%d.key",
23
+ "present_value_names": "present.%d.value"
24
+ },
25
+ "num_attention_heads": 32,
26
+ "num_hidden_layers": 16,
27
+ "num_key_value_heads": 8
28
+ },
29
+ "eos_token_id": 133310,
30
+ "pad_token_id": 128001,
31
+ "type": "llama",
32
+ "vocab_size": 134400
33
+ },
34
+ "search": {
35
+ "diversity_penalty": 0.0,
36
+ "do_sample": true,
37
+ "early_stopping": true,
38
+ "length_penalty": 1.0,
39
+ "max_length": 131072,
40
+ "min_length": 0,
41
+ "no_repeat_ngram_size": 0,
42
+ "num_beams": 1,
43
+ "num_return_sequences": 1,
44
+ "past_present_share_buffer": false,
45
+ "repetition_penalty": 1.1,
46
+ "temperature": 0.4,
47
+ "top_k": 1,
48
+ "top_p": 0.9
49
+ }
50
+ }
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dee7e4bee1f8c88c27685a97b463bb19e23397f1528db305e828c823d38faa45
3
+ size 382529
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dade0f04f9868cbd28f343818019e71266b39c555b6144bf93b376ebafca074b
3
+ size 3064074240
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/Prince-1/Llama-OuteTTS-1.0-1B
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec54a55e3c6fc7318ea02cbd1a6eb1fb180bff1b58acbf068504a25f1b407b7b
3
+ size 18366636
models/multi/Llama-OuteTTS-1.0-1B (Prince-1)/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
models/multi/Llama-OuteTTS-1.0-1B-ONNX/.gitattributes ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text
37
+ onnx/model_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/multi/Llama-OuteTTS-1.0-1B-ONNX/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers.js
3
+ base_model: OuteAI/Llama-OuteTTS-1.0-1B
4
+ license: cc-by-nc-sa-4.0
5
+ language:
6
+ - en
7
+ - ar
8
+ - zh
9
+ - nl
10
+ - fr
11
+ - de
12
+ - it
13
+ - ja
14
+ - ko
15
+ - lt
16
+ - ru
17
+ - es
18
+ - pt
19
+ - be
20
+ - bn
21
+ - ka
22
+ - hu
23
+ - lv
24
+ - fa
25
+ - pl
26
+ - sw
27
+ - ta
28
+ - uk
29
+ pipeline_tag: text-to-speech
30
+ ---
31
+
32
+ https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B with ONNX weights to be compatible with Transformers.js.
33
+
34
+ Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).
models/multi/Llama-OuteTTS-1.0-1B-ONNX/config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "OuteAI/Llama-OuteTTS-1.0-1B",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 133309,
9
+ "eos_token_id": 133310,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 8192,
15
+ "max_position_embeddings": 131072,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 16,
20
+ "num_key_value_heads": 8,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": {
24
+ "factor": 32.0,
25
+ "high_freq_factor": 4.0,
26
+ "low_freq_factor": 1.0,
27
+ "original_max_position_embeddings": 8192,
28
+ "rope_type": "llama3"
29
+ },
30
+ "rope_theta": 500000.0,
31
+ "tie_word_embeddings": true,
32
+ "torch_dtype": "bfloat16",
33
+ "transformers_version": "4.49.0",
34
+ "use_cache": true,
35
+ "vocab_size": 134400,
36
+ "transformers.js_config": {
37
+ "kv_cache_dtype": {
38
+ "q4f16": "float16",
39
+ "fp16": "float16"
40
+ },
41
+ "use_external_data_format": {
42
+ "model.onnx": true,
43
+ "model_fp16.onnx": true
44
+ }
45
+ }
46
+ }
models/multi/Llama-OuteTTS-1.0-1B-ONNX/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 133309,
4
+ "do_sample": true,
5
+ "eos_token_id": 133310,
6
+ "min_p": 0.05,
7
+ "pad_token_id": 128001,
8
+ "repetition_penalty": 1.1,
9
+ "temperature": 0.4,
10
+ "top_k": 40,
11
+ "top_p": 0.9,
12
+ "transformers_version": "4.49.0"
13
+ }
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c3c81e57665ba6076771d67e0e03d8ef534e1fc48cf786f8c6d35d7e871bbeb
3
+ size 111218
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a59969940d166ff1b9c603fce1fb3b1e8459e942fec724716fcb4a71f373d801
3
+ size 5027143680
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_bnb4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71d0e2f236f33fcdc00d3eb808276d881c1508d884b8699cd88778f523a2c679
3
+ size 1682311029
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48e0abbdfdbdac6f41c10f0e7cf2215704e0f5bb3ed5070205c046b3ba80478b
3
+ size 432256670
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_fp16.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcf0dc21d2e2abb03f7cace56f78c122208dcec2390f7ac63078c55fd5f5f277
3
+ size 2081423360
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fe625c4a905bbb67cffe16de38f65da1f34a251cedfd65288b3fb0539df44ef
3
+ size 1282385137
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_q4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdcdb6e6fdae7ecca8b2167e487defc373578ffa7cce01e3e3e2599bd352e2cf
3
+ size 1743127573
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f14a2831ec98ad95b571c12e2db0f95d6cc1b372a5e4e50dfe08f10c6a9be0d
3
+ size 1114895330
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:118212594d548fff284fa7bcd096065fc84549366f3c6bfcbaf40dd94c2cc2b9
3
+ size 1282385197
models/multi/Llama-OuteTTS-1.0-1B-ONNX/onnx/model_uint8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:118212594d548fff284fa7bcd096065fc84549366f3c6bfcbaf40dd94c2cc2b9
3
+ size 1282385197
models/multi/Llama-OuteTTS-1.0-1B-ONNX/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B-ONNX
models/multi/Llama-OuteTTS-1.0-1B-ONNX/special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
models/multi/Llama-OuteTTS-1.0-1B-ONNX/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f45db023112d30b02574448e2602ee20cc192e0038b5d5e98ab73e5792a0f66
3
+ size 12415714
models/multi/Llama-OuteTTS-1.0-1B-ONNX/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
models/multi/Llama-OuteTTS-1.0-1B/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/multi/Llama-OuteTTS-1.0-1B/README.md ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-sa-4.0
3
+ language:
4
+ - en
5
+ - ar
6
+ - zh
7
+ - nl
8
+ - fr
9
+ - de
10
+ - it
11
+ - ja
12
+ - ko
13
+ - lt
14
+ - ru
15
+ - es
16
+ - pt
17
+ - be
18
+ - bn
19
+ - ka
20
+ - hu
21
+ - lv
22
+ - fa
23
+ - pl
24
+ - sw
25
+ - ta
26
+ - uk
27
+ pipeline_tag: text-to-speech
28
+ library_name: outetts
29
+ ---
30
+ <div class="p-4 bg-gray-50 dark:bg-gray-800 rounded-lg shadow-sm mb-12">
31
+ <div class="text-center mb-4">
32
+ <h2 class="text-xl font-light text-gray-900 dark:text-white tracking-tight mt-0 mb-0">Oute A I</h2>
33
+ <div class="flex justify-center gap-6 mt-4">
34
+ <a href="https://www.outeai.com/" target="_blank" class="flex items-center gap-1 text-gray-700 dark:text-gray-300 text-m font-medium hover:text-gray-900 dark:hover:text-white transition-colors underline">
35
+ <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
36
+ <circle cx="12" cy="12" r="10"></circle>
37
+ <path d="M2 12h20M12 2a15.3 15.3 0 0 1 4 10 15.3 15.3 0 0 1-4 10 15.3 15.3 0 0 1-4-10 15.3 15.3 0 0 1 4-10z"></path>
38
+ </svg>
39
+ outeai.com
40
+ </a>
41
+ <a href="https://discord.gg/vyBM87kAmf" target="_blank" class="flex items-center gap-1 text-gray-700 dark:text-gray-300 text-m font-medium hover:text-gray-900 dark:hover:text-white transition-colors underline">
42
+ <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
43
+ <path d="M21 11.5a8.38 8.38 0 0 1-.9 3.8 8.5 8.5 0 0 1-7.6 4.7 8.38 8.38 0 0 1-3.8-.9L3 21l1.9-5.7a8.38 8.38 0 0 1-.9-3.8 8.5 8.5 0 0 1 4.7-7.6 8.38 8.38 0 0 1 3.8-.9h.5a8.48 8.48 0 0 1 8 8v.5z"></path>
44
+ </svg>
45
+ Discord
46
+ </a>
47
+ <a href="https://x.com/OuteAI" target="_blank" class="flex items-center gap-1 text-gray-700 dark:text-gray-300 text-m font-medium hover:text-gray-900 dark:hover:text-white transition-colors underline">
48
+ <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
49
+ <path d="M23 3a10.9 10.9 0 0 1-3.14 1.53 4.48 4.48 0 0 0-7.86 3v1A10.66 10.66 0 0 1 3 4s-4 9 5 13a11.64 11.64 0 0 1-7 2c9 5 20 0 20-11.5a4.5 4.5 0 0 0-.08-.83A7.72 7.72 0 0 0 23 3z"></path>
50
+ </svg>
51
+ @OuteAI
52
+ </a>
53
+ </div>
54
+ </div>
55
+
56
+ <div class="grid grid-cols-3 sm:grid-cols-3 gap-2">
57
+ <a href="https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B" target="_blank" class="bg-white dark:bg-gray-700 text-gray-800 dark:text-gray-100 text-sm font-medium py-2 px-3 rounded-md text-center hover:bg-gray-100 dark:hover:bg-gray-600 hover:border-gray-300 dark:hover:border-gray-500 border border-transparent transition-all">
58
+ Llama OuteTTS 1.0 1B
59
+ </a>
60
+ <a href="https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B-GGUF" target="_blank" class="bg-white dark:bg-gray-700 text-gray-800 dark:text-gray-100 text-sm font-medium py-2 px-3 rounded-md text-center hover:bg-gray-100 dark:hover:bg-gray-600 hover:border-gray-300 dark:hover:border-gray-500 border border-transparent transition-all">
61
+ Llama OuteTTS 1.0 1B GGUF
62
+ </a>
63
+ <a href="https://github.com/edwko/OuteTTS" target="_blank" class="bg-white dark:bg-gray-700 text-gray-800 dark:text-gray-100 text-sm font-medium py-2 px-3 rounded-md text-center hover:bg-gray-100 dark:hover:bg-gray-600 hover:border-gray-300 dark:hover:border-gray-500 border border-transparent transition-all">
64
+ GitHub Library
65
+ </a>
66
+ </div>
67
+ </div>
68
+
69
+ > [!IMPORTANT]
70
+ > **Important Sampling Considerations**
71
+ >
72
+ > When using OuteTTS version 1.0, it is crucial to use the settings specified in the [Sampling Configuration](#sampling-configuration) section.
73
+ > The **repetition penalty implementation** is particularly important - this model requires penalization applied to a **64-token recent window**,
74
+ > rather than across the entire context window. Penalizing the entire context will cause the model to produce **broken or low-quality output**.
75
+ >
76
+ > To address this limitation, all necessary samplers and patches for all backends are set up automatically in the **outetts** library.
77
+ > If using a custom implementation, ensure you correctly implement these requirements.
78
+
79
+ # OuteTTS Version 1.0
80
+
81
+ This update brings significant improvements in speech synthesis and voice cloning—delivering a more powerful, accurate, and user-friendly experience in a compact size.
82
+
83
+ ## What's New
84
+
85
+ ### 1. Prompt Revamp & Dependency Removal
86
+ - **Automatic Word Alignment:** The model now performs word alignment internally. Simply input raw text—no pre-processing required—and the model handles the rest, streamlining your workflow. For optimal results, use normalized, readable text without newlines (light normalization is applied automatically in outetts library).
87
+ - **Native Multilingual Text Support:** Direct support for native text across multiple languages eliminates the need for romanization.
88
+ - **Enhanced Metadata Integration:** The updated prompt system incorporates additional metadata (time, energy, spectral centroid, pitch) at both global and word levels, improving speaker flow and synthesis quality.
89
+ - **Special Tokens for Audio Codebooks:** New tokens for c1 (codebook 1) and c2 (codebook 2).
90
+
91
+ ### 2. New Audio Encoder Model
92
+ - **DAC Encoder:** Integrates a DAC audio encoder from [ibm-research/DAC.speech.v1.0](https://huggingface.co/ibm-research/DAC.speech.v1.0), utilizing two codebooks for high quality audio reconstruction.
93
+ - **Performance Trade-off:** Improved audio fidelity increases the token generation rate from 75 to 150 tokens per second. This trade-off prioritizes quality, especially for multilingual applications.
94
+
95
+ ### 3. Voice Cloning
96
+ - **One-Shot Voice Cloning:** To achieve one-shot cloning, the model typically requires only around **10 seconds** of reference audio to produce an accurate voice representation.
97
+ - **Improved Accuracy:** Enhanced by the new encoder and additional training metadata, voice cloning is now more natural and precise.
98
+
99
+ ### 4. Auto Text Alignment & Numerical Support
100
+ - **Automatic Text Alignment:** Aligns raw text at the word level, even for languages without clear boundaries (e.g., Japanese, Chinese), using insights from pre-processed training data.
101
+ - **Direct Numerical Input:** Built-in multilingual numerical support allows direct use of numbers in prompts—no textual conversion needed. (The model typically chooses the dominant language present. Mixing languages in a single prompt may lead to mistakes.)
102
+
103
+ ### 5. Multilingual Capabilities
104
+
105
+ - **Supported Languages:** OuteTTS offers varying proficiency levels across languages, based on training data exposure.
106
+
107
+ - **High Training Data Languages:** These languages feature extensive training: **English, Arabic, Chinese, Dutch, French, German, Italian, Japanese, Korean, Lithuanian, Russian, Spanish**
108
+
109
+ - **Moderate Training Data Languages:** These languages received moderate training, offering good performance with occasional limitations: **Portuguese, Belarusian, Bengali, Georgian, Hungarian, Latvian, Persian/Farsi, Polish, Swahili, Tamil, Ukrainian**
110
+
111
+ - **Beyond Supported Languages:** The model can generate speech in untrained languages with varying success. Experiment with unlisted languages, though results may not be optimal.
112
+
113
+ ## Video Showcase
114
+
115
+ <video width="1280" height="720" controls style="box-shadow: 0px 0px 20px 10px rgba(0, 0, 0, 0.05), 0px 1px 3px 10px rgba(255, 255, 255, 0.05);">
116
+ <source src="https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B-GGUF/resolve/main/media/showcase.mp4" type="video/mp4">
117
+ Your browser does not support the video tag.
118
+ </video>
119
+
120
+ ## Quick Start Guide
121
+
122
+ Getting started with **OuteTTS** is simple:
123
+
124
+ ### Installation
125
+
126
+ 🔗 [Installation instructions](https://github.com/edwko/OuteTTS?tab=readme-ov-file#installation)
127
+
128
+ ### Basic Usage
129
+ ```python
130
+ import outetts
131
+
132
+ # Initialize the interface
133
+ interface = outetts.Interface(
134
+ config=outetts.ModelConfig.auto_config(
135
+ model=outetts.Models.VERSION_1_0_SIZE_1B,
136
+ # For llama.cpp backend
137
+ backend=outetts.Backend.LLAMACPP,
138
+ quantization=outetts.LlamaCppQuantization.FP16
139
+ # For transformers backend
140
+ # backend=outetts.Backend.HF,
141
+ )
142
+ )
143
+
144
+ # Load the default speaker profile
145
+ speaker = interface.load_default_speaker("EN-FEMALE-1-NEUTRAL")
146
+
147
+ # Or create your own speaker profiles in seconds and reuse them instantly
148
+ # speaker = interface.create_speaker("path/to/audio.wav")
149
+ # interface.save_speaker(speaker, "speaker.json")
150
+ # speaker = interface.load_speaker("speaker.json")
151
+
152
+ # Generate speech
153
+ output = interface.generate(
154
+ config=outetts.GenerationConfig(
155
+ text="Hello, how are you doing?",
156
+ generation_type=outetts.GenerationType.CHUNKED,
157
+ speaker=speaker,
158
+ sampler_config=outetts.SamplerConfig(
159
+ temperature=0.4
160
+ ),
161
+ )
162
+ )
163
+
164
+ # Save to file
165
+ output.save("output.wav")
166
+ ```
167
+
168
+ ### More Configuration Options
169
+ For advanced settings and customization, visit the official repository:
170
+ 🔗 [interface_usage.md](https://github.com/edwko/OuteTTS/blob/main/docs/interface_usage.md)
171
+
172
+ ## Usage Recommendations
173
+
174
+ ### Speaker Reference
175
+ The model is designed to be used with a speaker reference. Without one, it generates random vocal characteristics, often leading to lower-quality outputs.
176
+ The model inherits the referenced speaker's emotion, style, and accent.
177
+ When transcribing to other languages with the same speaker, you may observe the model retaining the original accent.
178
+
179
+ ### Multilingual Application
180
+ It is recommended to create a speaker profile in the language you intend to use. This helps achieve the best results in that specific language, including tone, accent, and linguistic features.
181
+
182
+ While the model supports cross-lingual speech, it still relies on the reference speaker. If the speaker has a distinct accent—such as British English—other languages may carry that accent as well.
183
+
184
+ ### Optimal Audio Length
185
+ - **Best Performance:** Generate audio around **42 seconds** in a single run (approximately 8,192 tokens). It is recommended not to approach the limits of this window when generating. Usually, the best results are up to 7,000 tokens.
186
+ - **Context Reduction with Speaker Reference:** If the speaker reference is 10 seconds long, the effective context is reduced to approximately 32 seconds.
187
+
188
+ ### Temperature Setting Recommendations
189
+ Testing shows that a temperature of **0.4** is an ideal starting point for accuracy (with the sampling settings below). However, some voice references may benefit from higher temperatures for enhanced expressiveness or slightly lower temperatures for more precise voice replication.
190
+
191
+ ### Verifying Speaker Encoding
192
+ If the cloned voice quality is subpar, check the encoded speaker sample.
193
+
194
+ ```python
195
+ interface.decode_and_save_speaker(speaker=your_speaker, path="speaker.wav")
196
+ ```
197
+
198
+ The DAC audio reconstruction model is lossy, and samples with clipping, excessive loudness, or unusual vocal features may introduce encoding issues that impact output quality.
199
+
200
+ ### Sampling Configuration
201
+ For optimal results with this TTS model, use the following sampling settings.
202
+
203
+ | Parameter | Value |
204
+ |-------------------|----------|
205
+ | Temperature | 0.4 |
206
+ | Repetition Penalty| 1.1 |
207
+ | **Repetition Range** | **64** |
208
+ | Top-k | 40 |
209
+ | Top-p | 0.9 |
210
+ | Min-p | 0.05 |
211
+
212
+ ## Model Specifications
213
+
214
+ - **Training Data:** Trained on **~60k hours of audio**
215
+ - **Context Length:** Supports a maximum context window of **8,192 tokens**
216
+
217
+ ### Training Parameters
218
+
219
+ #### **Pre-Training**
220
+ - **Optimizer:** AdamW
221
+ - **Batch Size:** 1 million tokens
222
+ - **Max Learning Rate:** 3e-4
223
+ - **Min Learning Rate:** 3e-5
224
+ - **Context Length:** 8192
225
+
226
+ #### **Fine-Tuning**
227
+ - **Optimizer:** AdamW
228
+ - **Max Learning Rate:** 1e-5
229
+ - **Min Learning Rate:** 5e-6
230
+ - **Data:** 10,000 diverse, high-quality examples
231
+
232
+ ## License Information
233
+
234
+ - **Initial Llama3.2 Components:** [Llama 3.2 Community License Agreement](https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/LICENSE.txt)
235
+ - **Our Continued Pre-Training, Fine-Tuning, and Additional Components:** [CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/)
236
+
237
+ ## Acknowledgments
238
+
239
+ - Big thanks to **Hugging Face** for their continued resource support through their grant program!
240
+ - Audio encoding and decoding utilize [ibm-research/DAC.speech.v1.0](https://huggingface.co/ibm-research/DAC.speech.v1.0)
241
+ - OuteTTS is built with [Llama3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) as the base model, with continued pre-training and fine-tuning.
242
+
243
+ ### Ethical Use Guidelines
244
+ This text-to-speech model is intended for legitimate applications that enhance accessibility, creativity, and communication;
245
+ prohibited uses include impersonation without consent, creation of deliberately misleading content,
246
+ generation of harmful or harassing material, distribution of synthetic audio without proper disclosure,
247
+ voice cloning without permission, and any uses that violate applicable laws, regulations, or copyrights.
models/multi/Llama-OuteTTS-1.0-1B/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 133309,
8
+ "eos_token_id": 133310,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8192,
14
+ "max_position_embeddings": 131072,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 16,
19
+ "num_key_value_heads": 8,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": {
23
+ "factor": 32.0,
24
+ "high_freq_factor": 4.0,
25
+ "low_freq_factor": 1.0,
26
+ "original_max_position_embeddings": 8192,
27
+ "rope_type": "llama3"
28
+ },
29
+ "rope_theta": 500000.0,
30
+ "tie_word_embeddings": true,
31
+ "torch_dtype": "bfloat16",
32
+ "transformers_version": "4.48.3",
33
+ "use_cache": true,
34
+ "vocab_size": 134400
35
+ }
models/multi/Llama-OuteTTS-1.0-1B/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 133309,
4
+ "eos_token_id": 133310,
5
+ "pad_token_id": 128001,
6
+ "transformers_version": "4.48.3",
7
+ "do_sample": true,
8
+ "temperature": 0.4,
9
+ "repetition_penalty": 1.1,
10
+ "top_k": 40,
11
+ "top_p": 0.9,
12
+ "min_p": 0.05
13
+ }
models/multi/Llama-OuteTTS-1.0-1B/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03543805868adcbc81a2259177d626f557cfa8be6fa19480a47ba30a920b8003
3
+ size 2496811440
models/multi/Llama-OuteTTS-1.0-1B/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B
models/multi/Llama-OuteTTS-1.0-1B/special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
models/multi/Llama-OuteTTS-1.0-1B/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec54a55e3c6fc7318ea02cbd1a6eb1fb180bff1b58acbf068504a25f1b407b7b
3
+ size 18366636
models/multi/Llama-OuteTTS-1.0-1B/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff