Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

README.md +103 -3
config.json +16 -0
dep_former_csm.safetensors +3 -0
model.safetensors +3 -0
phoneme_to_token.json +125 -0

README.md CHANGED Viewed

@@ -1,3 +1,103 @@
----
-license: cc-by-4.0
----

+---
+license: cc-by-4.0
+datasets:
+- amphion/Emilia-Dataset
+- nvidia/hifitts-2
+language:
+- en
+pipeline_tag: text-to-speech
+tags:
+- text-to-speech
+---
+# Model Card for VoXtream2
+VoXtream2 is a zero-shot full-stream TTS model with dynamic speaking-rate control that can be updated mid-utterance on the fly.
+### Key features
+- **Dynamic speed control**: Distribution matching and classifier-free guidance enable fine-grained speaking-rate control, which can be adjusted while the model is generating speech.
+- **Streaming performance**: Runs **4x** faster than real time and achieves **74 ms** first-packet latency in full-stream mode on a consumer GPU.
+- **Translingual capability**: Prompt text masking enables support of acoustic prompts in any language.
+### Model Sources
+- **Repository:** [repo](https://github.com/herimor/voxtream)
+- **Paper:** [paper](https://arxiv.org/pdf/2603.13518)
+- **Demo Page:** [demo page](https://herimor.github.io/voxtream2)
+- **Live Demo:** [live demo](https://huggingface.co/spaces/herimor/voxtream2)
+## Get started
+### Installation
+### eSpeak NG phonemizer
+```bash
+# For Debian-like distributions (e.g. Ubuntu, Mint)
+apt-get install espeak-ng
+# For RedHat-like distributions (e.g. CentOS, Fedora)
+yum install espeak-ng
+# For macOS
+brew install espeak-ng
+```
+### Pip package
+```bash
+pip install "voxtream>=0.2"
+```
+### Usage
+* Prompt audio: a file containing 3-10 seconds of the target voice. The maximum supported length is 20 seconds (longer audio will be trimmed).
+* Text: What you want the model to say. The maximum supported length is 1000 characters (longer text will be trimmed).
+* Speaking rate (optional): target speaking rate in syllables per second.
+#### Output streaming
+```bash
+voxtream \
+    --prompt-audio assets/audio/english_male.wav \
+    --text "In general, however, some method is then needed to evaluate each approximation." \
+    --output "output_stream.wav"
+```
+#### Full streaming (slow speech, 2 syllables per second)
+```bash
+voxtream \
+    --prompt-audio assets/audio/english_female.wav \
+    --text "Staff do not always do enough to prevent violence." \
+    --output "full_stream_2sps.wav" \
+    --full-stream \
+    --spk-rate 2.0
+```
+* Note: The initial run may take some time to download the model weights and warm up the model graph.
+### Out-of-Scope Use
+Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without their consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this requirement, you could be in violation of copyright laws.
+## Training Data
+The model was trained on the [Emilia](https://huggingface.co/datasets/amphion/Emilia-Dataset) and [HiFiTTS2](https://huggingface.co/datasets/nvidia/hifitts-2) datasets. You can download the preprocessed dataset [here](https://huggingface.co/datasets/herimor/voxtream2-train). For more details, please check our paper.
+## Citation
+```
+@inproceedings{torgashov2026voxtream,
+  title={Vo{X}tream: Full-Stream Text-to-Speech with Extremely Low Latency},
+  author={Torgashov, Nikita and Henter, Gustav Eje and Skantze, Gabriel},
+  booktitle={Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  year={2026},
+  note={to appear},
+  url={https://arxiv.org/abs/2509.15969}
+}
+@article{torgashov2026voxtream2,
+  author    = {Torgashov, Nikita and Henter, Gustav Eje and Skantze, Gabriel},
+  title     = {Vo{X}tream2: Full-stream TTS with dynamic speaking rate control},
+  journal   = {arXiv:2603.13518},
+  year      = {2026}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "phone_former": "phone_former",
+    "temp_former": "temp_former",
+    "dep_former": "dep_former_csm",
+    "phone_vocab_size": 125,
+    "audio_vocab_size": 2050,
+    "audio_pad_size": 0,
+    "embedding_dim": 1024,
+    "spk_embedding_dim": 192,
+    "num_codebooks": 16,
+    "num_phone_states": 6,
+    "amortization_divisor": 16,
+    "max_look_ahead": 5,
+    "audio_window_size": 625,
+    "phone_window_size": 625
+}

dep_former_csm.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3e80e66f39cb010de18763721eaa9523f07827ccf21dd7b8a1486d2abc4bc89
+size 704938152

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b0761a350f9908227dcdce4556328a5896d1bab9d609939869ea941f206febb5
+size 1851507776

phoneme_to_token.json ADDED Viewed

	@@ -0,0 +1,125 @@

+{
+    "a\u026a": 0,
+    "a\u026a\u0259": 1,
+    "a\u026a\u025a": 2,
+    "a\u028a": 3,
+    "b": 4,
+    "d": 5,
+    "d\u0292": 6,
+    "e\u026a": 7,
+    "f": 8,
+    "h": 9,
+    "i": 10,
+    "i\u0259": 11,
+    "i\u02d0": 12,
+    "j": 13,
+    "k": 14,
+    "l": 15,
+    "m": 16,
+    "n": 17,
+    "n\u0329": 18,
+    "o\u028a": 19,
+    "o\u02d0": 20,
+    "o\u02d0\u0279": 21,
+    "p": 22,
+    "r": 23,
+    "s": 24,
+    "t": 25,
+    "t\u0283": 26,
+    "u\u02d0": 27,
+    "v": 28,
+    "w": 29,
+    "x": 30,
+    "z": 31,
+    "\u00e6": 32,
+    "\u00f0": 33,
+    "\u014b": 34,
+    "\u0250": 35,
+    "\u0251\u02d0": 36,
+    "\u0251\u02d0\u0279": 37,
+    "\u0254": 38,
+    "\u0254\u026a": 39,
+    "\u0254\u02d0": 40,
+    "\u0254\u02d0\u0279": 41,
+    "\u0259": 42,
+    "\u0259l": 43,
+    "\u025a": 44,
+    "\u025b": 45,
+    "\u025b\u0279": 46,
+    "\u025c\u02d0": 47,
+    "\u0261": 48,
+    "\u026a": 49,
+    "\u026a\u0279": 50,
+    "\u0279": 51,
+    "\u027e": 52,
+    "\u0283": 53,
+    "\u028a": 54,
+    "\u028a\u0279": 55,
+    "\u028c": 56,
+    "\u0292": 57,
+    "\u0294": 58,
+    "\u02c8a\u026a": 59,
+    "\u02c8a\u026a\u0259": 60,
+    "\u02c8a\u026a\u025a": 61,
+    "\u02c8a\u028a": 62,
+    "\u02c8e\u026a": 63,
+    "\u02c8i\u0259": 64,
+    "\u02c8i\u02d0": 65,
+    "\u02c8o\u028a": 66,
+    "\u02c8o\u02d0": 67,
+    "\u02c8o\u02d0\u0279": 68,
+    "\u02c8u\u02d0": 69,
+    "\u02c8\u00e6": 70,
+    "\u02c8\u0251\u02d0": 71,
+    "\u02c8\u0251\u02d0\u0279": 72,
+    "\u02c8\u0254": 73,
+    "\u02c8\u0254\u026a": 74,
+    "\u02c8\u0254\u02d0": 75,
+    "\u02c8\u0254\u02d0\u0279": 76,
+    "\u02c8\u0259": 77,
+    "\u02c8\u025a": 78,
+    "\u02c8\u025b": 79,
+    "\u02c8\u025b\u0279": 80,
+    "\u02c8\u025b\u02d0": 81,
+    "\u02c8\u025c\u02d0": 82,
+    "\u02c8\u026a": 83,
+    "\u02c8\u026a\u0279": 84,
+    "\u02c8\u028a": 85,
+    "\u02c8\u028a\u0279": 86,
+    "\u02c8\u028c": 87,
+    "\u02cca\u026a": 88,
+    "\u02cca\u026a\u025a": 89,
+    "\u02cca\u028a": 90,
+    "\u02cce\u026a": 91,
+    "\u02cci\u0259": 92,
+    "\u02cci\u02d0": 93,
+    "\u02cco\u028a": 94,
+    "\u02cco\u02d0": 95,
+    "\u02cco\u02d0\u0279": 96,
+    "\u02ccu\u02d0": 97,
+    "\u02cc\u00e6": 98,
+    "\u02cc\u0250": 99,
+    "\u02cc\u0251\u02d0": 100,
+    "\u02cc\u0251\u02d0\u0279": 101,
+    "\u02cc\u0254": 102,
+    "\u02cc\u0254\u026a": 103,
+    "\u02cc\u0254\u02d0": 104,
+    "\u02cc\u0254\u02d0\u0279": 105,
+    "\u02cc\u0259": 106,
+    "\u02cc\u025b": 107,
+    "\u02cc\u025b\u0279": 108,
+    "\u02cc\u025c\u02d0": 109,
+    "\u02cc\u026a": 110,
+    "\u02cc\u026a\u0279": 111,
+    "\u02cc\u028a": 112,
+    "\u02cc\u028a\u0279": 113,
+    "\u02cc\u028c": 114,
+    "\u03b8": 115,
+    "\u1d7b": 116,
+    ".": 117,
+    ",": 118,
+    "?": 119,
+    "sil": 120,
+    "!": 121,
+    "unk": 122
+}