ChaitanyaChandra committed
Commit c75822a · 1 Parent(s): 69c8f4a

Deploy VibeVoice to Spaces (runtime model + LFS presets/images)

This view is limited to 50 files because it contains too many changes. See raw diff for the full changeset.
Files changed (50)
  1. .gitattributes +3 -34
  2. .gitignore +181 -0
  3. Dockerfile +1 -1
  4. Figures/MOS-preference.png +3 -0
  5. Figures/VibeVoice.jpg +3 -0
  6. Figures/VibeVoice_Realtime.png +3 -0
  7. Figures/VibeVoice_logo.png +3 -0
  8. Figures/VibeVoice_logo_white.png +3 -0
  9. README.md +123 -12
  10. demo/code.sh +1 -0
  11. demo/realtime_model_inference_from_file.py +314 -0
  12. demo/text_examples/1p_abs.txt +2 -0
  13. demo/text_examples/1p_vibevoice.txt +1 -0
  14. demo/vibevoice_realtime_colab.ipynb +198 -0
  15. demo/vibevoice_realtime_demo.py +17 -0
  16. demo/voices/streaming_model/de-Spk0_man.pt +3 -0
  17. demo/voices/streaming_model/de-Spk1_woman.pt +3 -0
  18. demo/voices/streaming_model/en-Carter_man.pt +3 -0
  19. demo/voices/streaming_model/en-Davis_man.pt +3 -0
  20. demo/voices/streaming_model/en-Emma_woman.pt +3 -0
  21. demo/voices/streaming_model/en-Frank_man.pt +3 -0
  22. demo/voices/streaming_model/en-Grace_woman.pt +3 -0
  23. demo/voices/streaming_model/en-Mike_man.pt +3 -0
  24. demo/voices/streaming_model/fr-Spk0_man.pt +3 -0
  25. demo/voices/streaming_model/fr-Spk1_woman.pt +3 -0
  26. demo/voices/streaming_model/in-Samuel_man.pt +3 -0
  27. demo/voices/streaming_model/it-Spk0_woman.pt +3 -0
  28. demo/voices/streaming_model/it-Spk1_man.pt +3 -0
  29. demo/voices/streaming_model/jp-Spk0_man.pt +3 -0
  30. demo/voices/streaming_model/jp-Spk1_woman.pt +3 -0
  31. demo/voices/streaming_model/kr-Spk0_woman.pt +3 -0
  32. demo/voices/streaming_model/kr-Spk1_man.pt +3 -0
  33. demo/voices/streaming_model/nl-Spk0_man.pt +3 -0
  34. demo/voices/streaming_model/nl-Spk1_woman.pt +3 -0
  35. demo/voices/streaming_model/pl-Spk0_man.pt +3 -0
  36. demo/voices/streaming_model/pl-Spk1_woman.pt +3 -0
  37. demo/voices/streaming_model/pt-Spk0_woman.pt +3 -0
  38. demo/voices/streaming_model/pt-Spk1_man.pt +3 -0
  39. demo/voices/streaming_model/sp-Spk0_woman.pt +3 -0
  40. demo/voices/streaming_model/sp-Spk1_man.pt +3 -0
  41. demo/web/app.py +507 -0
  42. demo/web/index.html +1017 -0
  43. docs/vibevoice-realtime-0.5b.md +139 -0
  44. vibevoice/__init__.py +16 -0
  45. vibevoice/configs/qwen2.5_1.5b_64k.json +112 -0
  46. vibevoice/configs/qwen2.5_7b_32k.json +113 -0
  47. vibevoice/modular/__init__.py +14 -0
  48. vibevoice/modular/configuration_vibevoice.py +248 -0
  49. vibevoice/modular/configuration_vibevoice_streaming.py +85 -0
  50. vibevoice/modular/modeling_vibevoice_streaming.py +190 -0
.gitattributes CHANGED
@@ -1,35 +1,4 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
  *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,181 @@
+ # Initially taken from GitHub's Python gitignore file
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # tests and logs
+ tests/fixtures/cached_*_text.txt
+ logs/
+ lightning_logs/
+ lang_code_data/
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # vscode
+ .vs
+ .vscode
+
+ # Pycharm
+ .idea
+
+ # TF code
+ tensorflow_code
+
+ # Models
+ proc_data
+
+ # examples
+ runs
+ /runs_old
+ /wandb
+ /examples/runs
+ /examples/**/*.args
+ /examples/rag/sweep
+
+ # data
+ /data
+ serialization_dir
+
+ # emacs
+ *.*~
+ debug.env
+
+ # vim
+ .*.swp
+
+ # ctags
+ tags
+
+ # pre-commit
+ .pre-commit*
+
+ # .lock
+ *.lock
+
+ # DS_Store (macOS)
+ .DS_Store
+
+ # ruff
+ .ruff_cache
+
+ # our proj
+ /output/
+ /outputs/
+ /checkpoint/
+ /checkpoints/
+ exp
+ .gradio/
+
+ # Ignored large models
+ demo/models/VibeVoice-Realtime-0.5B
+ *.safetensors
Dockerfile CHANGED
@@ -14,4 +14,4 @@ COPY . .
  RUN pip install -e .
  EXPOSE 7860
  WORKDIR /app/demo
- CMD ["python3", "vibevoice_realtime_demo.py", "--model_path", "../models/VibeVoice-Realtime-0.5B", "--port", "7860", "--device", "cpu"]
+ CMD ["python3", "vibevoice_realtime_demo.py", "--model_path", "microsoft/VibeVoice-Realtime-0.5B", "--port", "7860", "--device", "cpu"]
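The updated CMD points `--model_path` at the Hub repo id rather than a baked-in local copy, so the weights are fetched at container start. A minimal sketch for pre-fetching them instead, mirroring the `snapshot_download` call in the Colab notebook later in this commit (the `local_dir` path is illustrative):

```python
# Optional pre-fetch so a container/Space restart does not re-download the
# weights; mirrors the snapshot_download call used in the Colab notebook.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    "microsoft/VibeVoice-Realtime-0.5B",
    local_dir="models/VibeVoice-Realtime-0.5B",  # illustrative path
)
print(f"Model files cached in: {local_dir}")
```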
Figures/MOS-preference.png ADDED

Git LFS Details

  • SHA256: 1bae2db570246512bbf162aa3fb9fd3b3c80d17c89f917fe133b5649d4fb1857
  • Pointer size: 130 Bytes
  • Size of remote file: 67.2 kB
Figures/VibeVoice.jpg ADDED

Git LFS Details

  • SHA256: 353803ce2be393700ff3dfedd0a522b88ebd294702d0d2f51b6f7b7fe65d344f
  • Pointer size: 131 Bytes
  • Size of remote file: 342 kB
Figures/VibeVoice_Realtime.png ADDED

Git LFS Details

  • SHA256: 0386a7f577a66324c2b07cf3dff573bc805ce8687c8d6f8b5f3d6d04aed51250
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
Figures/VibeVoice_logo.png ADDED

Git LFS Details

  • SHA256: c39206a2524b48f0413a54ac5e6d668d52ef22c4f5f1d57386d785ccb27a3f1d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.42 MB
Figures/VibeVoice_logo_white.png ADDED

Git LFS Details

  • SHA256: fc14f811c968062cf6a624b12043cf76b13c89597a240e78db08031c9e5a42ba
  • Pointer size: 131 Bytes
  • Size of remote file: 318 kB
README.md CHANGED
@@ -1,12 +1,123 @@
- ---
- title: VibeVoice
- emoji: 🌍
- colorFrom: purple
- colorTo: yellow
- sdk: docker
- pinned: false
- license: mit
- short_description: microsoft VibeVoice-Realtime-0.5B
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <div align="center">
+
+ ## 🎙️ VibeVoice: Open-Source Frontier Voice AI
+ [![Project Page](https://img.shields.io/badge/Project-Page-blue?logo=microsoft)](https://microsoft.github.io/VibeVoice)
+ [![Hugging Face](https://img.shields.io/badge/HuggingFace-Collection-orange?logo=huggingface)](https://huggingface.co/collections/microsoft/vibevoice-68a2ef24a875c44be47b034f)
+ [![Technical Report](https://img.shields.io/badge/Technical-Report-red?logo=adobeacrobatreader)](https://arxiv.org/pdf/2508.19205)
+
+ </div>
+
+ <div align="center">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="Figures/VibeVoice_logo_white.png">
+ <img src="Figures/VibeVoice_logo.png" alt="VibeVoice Logo" width="300">
+ </picture>
+ </div>
+
+ <div align="left">
+
+ <h3>📰 News</h3>
+
+ <img src="https://img.shields.io/badge/Status-New-brightgreen?style=flat" alt="New" />
+ <img src="https://img.shields.io/badge/Feature-Realtime_TTS-blue?style=flat&logo=soundcharts" alt="Realtime TTS" />
+
+ <strong>2025-12-03: 📣 We open-sourced <a href="docs/vibevoice-realtime-0.5b.md"><strong>VibeVoice‑Realtime‑0.5B</strong></a>, a real‑time text‑to‑speech model that supports streaming text input and robust long-form speech generation. Try it on <a href="https://colab.research.google.com/github/microsoft/VibeVoice/blob/main/demo/vibevoice_realtime_colab.ipynb">Colab</a>.</strong>
+
+ <strong>2025-12-09: 📣 We have added experimental speakers in nine languages (DE, FR, IT, JP, KR, NL, PL, PT, ES) for exploration; you are welcome to try them out and share your feedback.</strong>
+
+ To mitigate deepfake risks and ensure low latency for the first speech chunk, voice prompts are provided in an embedded format. If you require voice customization, please reach out to our team. We will also be expanding the range of available speakers.
+ <br>
+
+ https://github.com/user-attachments/assets/0901d274-f6ae-46ef-a0fd-3c4fba4f76dc
+
+ > (Launch your own realtime demo via the websocket example in [Usage](docs/vibevoice-realtime-0.5b.md#usage-1-launch-real-time-websocket-demo).)
+
+ </div>
+
+ 2025-09-05: VibeVoice is an open-source research framework intended to advance collaboration in the speech synthesis community. After release, we discovered instances where the tool was used in ways inconsistent with the stated intent. Since responsible use of AI is one of Microsoft's guiding principles, we have disabled this repo until we are confident that out-of-scope use is no longer possible.
+
+ ### Overview
+
+ VibeVoice is a novel framework designed for generating **expressive**, **long-form**, **multi-speaker** conversational audio, such as podcasts, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly in scalability, speaker consistency, and natural turn-taking.
+
+ VibeVoice currently includes two model variants:
+
+ - **Long-form multi-speaker model**: Synthesizes conversational/single-speaker speech up to **90 minutes** long with up to **4 distinct speakers**, surpassing the typical 1–2 speaker limits of many prior models.
+ - **[Realtime streaming TTS model](docs/vibevoice-realtime-0.5b.md)**: Produces initial audible speech in ~**300 ms** and supports **streaming text input** for single-speaker **real-time** speech generation; designed for low-latency applications.
+
+ A core innovation of VibeVoice is its use of continuous speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice employs a [next-token diffusion](https://arxiv.org/abs/2412.08635) framework, leveraging a Large Language Model (LLM) to understand textual context and dialogue flow, and a diffusion head to generate high-fidelity acoustic details.
+
+ <p align="left">
+ <img src="Figures/MOS-preference.png" alt="MOS Preference Results" height="260px">
+ <img src="Figures/VibeVoice.jpg" alt="VibeVoice Overview" height="250px" style="margin-right: 10px;">
+ </p>
+
+ ### 🎵 Demo Examples
+
+ **Video Demo**
+
+ We produced this video with [Wan2.2](https://github.com/Wan-Video/Wan2.2). We sincerely appreciate the Wan-Video team for their great work.
+
+ **English**
+ <div align="center">
+
+ https://github.com/user-attachments/assets/0967027c-141e-4909-bec8-091558b1b784
+
+ </div>
+
+ **Chinese**
+ <div align="center">
+
+ https://github.com/user-attachments/assets/322280b7-3093-4c67-86e3-10be4746c88f
+
+ </div>
+
+ **Cross-Lingual**
+ <div align="center">
+
+ https://github.com/user-attachments/assets/838d8ad9-a201-4dde-bb45-8cd3f59ce722
+
+ </div>
+
+ **Spontaneous Singing**
+ <div align="center">
+
+ https://github.com/user-attachments/assets/6f27a8a5-0c60-4f57-87f3-7dea2e11c730
+
+ </div>
+
+ **Long Conversation with 4 People**
+ <div align="center">
+
+ https://github.com/user-attachments/assets/a357c4b6-9768-495c-a576-1618f6275727
+
+ </div>
+
+ For more examples, see the [Project Page](https://microsoft.github.io/VibeVoice).
+
+ ## Risks and limitations
+
+ While efforts have been made to optimize the model through various techniques, it may still produce outputs that are unexpected, biased, or inaccurate. VibeVoice inherits any biases, errors, or omissions produced by its base model (specifically, Qwen2.5-1.5B in this release).
+
+ Potential for Deepfakes and Disinformation: High-quality synthetic speech can be misused to create convincing fake audio content for impersonation, fraud, or spreading disinformation. Users must ensure transcripts are reliable, check content accuracy, and avoid using generated content in misleading ways. Users are expected to use generated content and to deploy the models lawfully, in full compliance with all applicable laws and regulations in the relevant jurisdictions. It is best practice to disclose the use of AI when sharing AI-generated content.
+
+ English and Chinese only: Transcripts in languages other than English or Chinese may result in unexpected audio outputs.
+
+ Non-Speech Audio: The model focuses solely on speech synthesis and does not handle background noise, music, or other sound effects.
+
+ Overlapping Speech: The current model does not explicitly model or generate overlapping speech segments in conversations.
+
+ We do not recommend using VibeVoice in commercial or real-world applications without further testing and development. This model is intended for research and development purposes only. Please use responsibly.
+
+ ## Star History
+
+ ![Star History Chart](https://api.star-history.com/svg?repos=Microsoft/vibevoice&type=date&legend=top-left)
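A quick sanity check on the README's numbers: at the stated 7.5 Hz tokenizer frame rate, 90 minutes of speech occupies well under a 64K context window.

```python
# 90 minutes of audio at the 7.5 Hz frame rate claimed in the README.
frame_rate_hz = 7.5
speech_frames = 90 * 60 * frame_rate_hz
print(speech_frames, speech_frames < 64 * 1024)  # 40500.0 True
```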
demo/code.sh ADDED
@@ -0,0 +1 @@
+ python3 vibevoice_realtime_demo.py --model_path models/VibeVoice-Realtime-0.5B --port 8000 --device mps
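This launch script targets Apple Silicon via `--device mps`; the demo scripts below normalize the `mpx` typo and fall back to CPU when MPS is unavailable. A tiny check one might run first, mirroring the device default in `realtime_model_inference_from_file.py` (standard torch API):

```python
# Check which device the demo can use on this machine.
import torch

device = "cuda" if torch.cuda.is_available() else (
    "mps" if torch.backends.mps.is_available() else "cpu"
)
print(device)
```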
demo/realtime_model_inference_from_file.py ADDED
@@ -0,0 +1,314 @@
+ import argparse
+ import os
+ import re
+ import traceback
+ from typing import List, Tuple, Union, Dict, Any
+ import time
+ import torch
+ import copy
+
+ from vibevoice.modular.modeling_vibevoice_streaming_inference import VibeVoiceStreamingForConditionalGenerationInference
+ from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor
+ from transformers.utils import logging
+
+ logging.set_verbosity_info()
+ logger = logging.get_logger(__name__)
+
+
+ class VoiceMapper:
+     """Maps speaker names to voice file paths"""
+
+     def __init__(self):
+         self.setup_voice_presets()
+
+         # Add shortened aliases matching our preset voice file names
+         new_dict = {}
+         for name, path in self.voice_presets.items():
+             if '_' in name:
+                 name = name.split('_')[0]
+             if '-' in name:
+                 name = name.split('-')[-1]
+             new_dict[name] = path
+         self.voice_presets.update(new_dict)
+
+     def setup_voice_presets(self):
+         """Set up voice presets by scanning the voices directory."""
+         voices_dir = os.path.join(os.path.dirname(__file__), "voices/streaming_model")
+
+         # Check that the voices directory exists
+         if not os.path.exists(voices_dir):
+             print(f"Warning: Voices directory not found at {voices_dir}")
+             self.voice_presets = {}
+             self.available_voices = {}
+             return
+
+         # Scan for all voice files in the voices directory
+         self.voice_presets = {}
+
+         # Get all .pt files in the voices directory
+         pt_files = [f for f in os.listdir(voices_dir)
+                     if f.lower().endswith('.pt') and os.path.isfile(os.path.join(voices_dir, f))]
+
+         # Create a dictionary keyed by filename (without extension)
+         for pt_file in pt_files:
+             name = os.path.splitext(pt_file)[0]
+             full_path = os.path.join(voices_dir, pt_file)
+             self.voice_presets[name] = full_path
+
+         # Sort the voice presets alphabetically by name for a nicer UI
+         self.voice_presets = dict(sorted(self.voice_presets.items()))
+
+         # Filter out voices whose files don't exist (redundant here, kept for safety)
+         self.available_voices = {
+             name: path for name, path in self.voice_presets.items()
+             if os.path.exists(path)
+         }
+
+         print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
+         print(f"Available voices: {', '.join(self.available_voices.keys())}")
+
+     def get_voice_path(self, speaker_name: str) -> str:
+         """Get the voice file path for a given speaker name"""
+         # First try an exact match
+         if speaker_name in self.voice_presets:
+             return self.voice_presets[speaker_name]
+
+         # Then try partial matching (case insensitive)
+         speaker_lower = speaker_name.lower()
+         for preset_name, path in self.voice_presets.items():
+             if preset_name.lower() in speaker_lower or speaker_lower in preset_name.lower():
+                 return path
+
+         # Default to the first voice if no match is found
+         default_voice = list(self.voice_presets.values())[0]
+         print(f"Warning: No voice preset found for '{speaker_name}', using default voice: {default_voice}")
+         return default_voice
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="VibeVoiceStreaming Processor TXT Input Test")
+     parser.add_argument(
+         "--model_path",
+         type=str,
+         default="microsoft/VibeVoice-Realtime-0.5B",
+         help="Path to the HuggingFace model directory",
+     )
+     parser.add_argument(
+         "--txt_path",
+         type=str,
+         default="demo/text_examples/1p_vibevoice.txt",
+         help="Path to the txt file containing the script",
+     )
+     parser.add_argument(
+         "--speaker_name",
+         type=str,
+         default="Wayne",
+         help="Single speaker name (e.g., --speaker_name Wayne)",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="./outputs",
+         help="Directory to save output audio files",
+     )
+     parser.add_argument(
+         "--device",
+         type=str,
+         default=("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")),
+         help="Device for inference: cuda | mps | cpu",
+     )
+     parser.add_argument(
+         "--cfg_scale",
+         type=float,
+         default=1.5,
+         help="CFG (Classifier-Free Guidance) scale for generation (default: 1.5)",
+     )
+
+     return parser.parse_args()
+
+
+ def main():
+     args = parse_args()
+
+     # Normalize the potential 'mpx' typo to 'mps'
+     if args.device.lower() == "mpx":
+         print("Note: device 'mpx' detected, treating it as 'mps'.")
+         args.device = "mps"
+
+     # Validate MPS availability if requested
+     if args.device == "mps" and not torch.backends.mps.is_available():
+         print("Warning: MPS not available. Falling back to CPU.")
+         args.device = "cpu"
+
+     print(f"Using device: {args.device}")
+
+     # Initialize the voice mapper
+     voice_mapper = VoiceMapper()
+
+     # Check that the txt file exists
+     if not os.path.exists(args.txt_path):
+         print(f"Error: txt file not found: {args.txt_path}")
+         return
+
+     # Read and parse the txt file
+     print(f"Reading script from: {args.txt_path}")
+     with open(args.txt_path, 'r', encoding='utf-8') as f:
+         scripts = f.read().strip()
+
+     if not scripts:
+         print("Error: No valid scripts found in the txt file")
+         return
+
+     full_script = scripts.replace("’", "'").replace('“', '"').replace('”', '"')
+
+     print(f"Loading processor & model from {args.model_path}")
+     processor = VibeVoiceStreamingProcessor.from_pretrained(args.model_path)
+
+     # Decide dtype & attention implementation
+     if args.device == "mps":
+         load_dtype = torch.float32   # MPS requires float32
+         attn_impl_primary = "sdpa"   # flash_attention_2 not supported on MPS
+     elif args.device == "cuda":
+         load_dtype = torch.bfloat16
+         attn_impl_primary = "flash_attention_2"
+     else:  # cpu
+         load_dtype = torch.float32
+         attn_impl_primary = "sdpa"
+     print(f"Using device: {args.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}")
+
+     # Load the model with device-specific logic
+     try:
+         if args.device == "mps":
+             model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                 args.model_path,
+                 torch_dtype=load_dtype,
+                 attn_implementation=attn_impl_primary,
+                 device_map=None,  # load, then move
+             )
+             model.to("mps")
+         elif args.device == "cuda":
+             model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                 args.model_path,
+                 torch_dtype=load_dtype,
+                 device_map="cuda",
+                 attn_implementation=attn_impl_primary,
+             )
+         else:  # cpu
+             model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                 args.model_path,
+                 torch_dtype=load_dtype,
+                 device_map="cpu",
+                 attn_implementation=attn_impl_primary,
+             )
+     except Exception as e:
+         if attn_impl_primary == 'flash_attention_2':
+             print(f"[ERROR] : {type(e).__name__}: {e}")
+             print(traceback.format_exc())
+             print("Error loading the model. Trying SDPA instead. Note that only flash_attention_2 has been fully tested; SDPA may result in lower audio quality.")
+             model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                 args.model_path,
+                 torch_dtype=load_dtype,
+                 device_map=(args.device if args.device in ("cuda", "cpu") else None),
+                 attn_implementation='sdpa',
+             )
+             if args.device == "mps":
+                 model.to("mps")
+         else:
+             raise e
+
+     model.eval()
+     model.set_ddpm_inference_steps(num_steps=5)
+
+     if hasattr(model.model, 'language_model'):
+         print(f"Language model attention: {model.model.language_model.config._attn_implementation}")
+
+     target_device = args.device
+     voice_sample = voice_mapper.get_voice_path(args.speaker_name)
+     all_prefilled_outputs = torch.load(voice_sample, map_location=target_device, weights_only=False)
+
+     # Prepare inputs for the model
+     inputs = processor.process_input_with_cached_prompt(
+         text=full_script,
+         cached_prompt=all_prefilled_outputs,
+         padding=True,
+         return_tensors="pt",
+         return_attention_mask=True,
+     )
+
+     # Move tensors to the target device
+     for k, v in inputs.items():
+         if torch.is_tensor(v):
+             inputs[k] = v.to(target_device)
+
+     print(f"Starting generation with cfg_scale: {args.cfg_scale}")
+
+     # Generate audio
+     start_time = time.time()
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=None,
+         cfg_scale=args.cfg_scale,
+         tokenizer=processor.tokenizer,
+         generation_config={'do_sample': False},
+         verbose=True,
+         all_prefilled_outputs=copy.deepcopy(all_prefilled_outputs) if all_prefilled_outputs is not None else None,
+     )
+     generation_time = time.time() - start_time
+     print(f"Generation time: {generation_time:.2f} seconds")
+
+     # Calculate audio duration and additional metrics
+     audio_duration = 0.0
+     rtf = float('inf')
+     if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
+         # Assuming a 24 kHz sample rate (common for speech synthesis)
+         sample_rate = 24000
+         audio_samples = outputs.speech_outputs[0].shape[-1] if len(outputs.speech_outputs[0].shape) > 0 else len(outputs.speech_outputs[0])
+         audio_duration = audio_samples / sample_rate
+         rtf = generation_time / audio_duration if audio_duration > 0 else float('inf')
+
+         print(f"Generated audio duration: {audio_duration:.2f} seconds")
+         print(f"RTF (Real Time Factor): {rtf:.2f}x")
+     else:
+         print("No audio output generated")
+
+     # Calculate token metrics
+     input_tokens = inputs['tts_text_ids'].shape[1]  # Number of input tokens
+     output_tokens = outputs.sequences.shape[1]      # Total tokens (input + generated)
+     generated_tokens = output_tokens - input_tokens - all_prefilled_outputs['tts_lm']['last_hidden_state'].size(1)
+
+     print(f"Prefilling text tokens: {input_tokens}")
+     print(f"Generated speech tokens: {generated_tokens}")
+     print(f"Total tokens: {output_tokens}")
+
+     # Save output (the processor handles devices internally)
+     txt_filename = os.path.splitext(os.path.basename(args.txt_path))[0]
+     output_path = os.path.join(args.output_dir, f"{txt_filename}_generated.wav")
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     processor.save_audio(
+         outputs.speech_outputs[0],  # First (and only) batch item
+         output_path=output_path,
+     )
+     print(f"Saved output to {output_path}")
+
+     # Print summary
+     print("\n" + "="*50)
+     print("GENERATION SUMMARY")
+     print("="*50)
+     print(f"Input file: {args.txt_path}")
+     print(f"Output file: {output_path}")
+     print(f"Speaker name: {args.speaker_name}")
+     print(f"Prefilling text tokens: {input_tokens}")
+     print(f"Generated speech tokens: {generated_tokens}")
+     print(f"Total tokens: {output_tokens}")
+     print(f"Generation time: {generation_time:.2f} seconds")
+     print(f"Audio duration: {audio_duration:.2f} seconds")
+     print(f"RTF (Real Time Factor): {rtf:.2f}x")
+     print("="*50)
+
+
+ if __name__ == "__main__":
+     main()
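For readers unfamiliar with the RTF metric this script prints: it is generation wall-clock time divided by the duration of the audio produced, so values below 1.0 mean faster-than-realtime synthesis. The numbers below are illustrative only, not measurements.

```python
# Illustration of the RTF computed above (example numbers, not measurements).
generation_time = 6.3   # seconds spent in model.generate
audio_duration = 12.6   # seconds of audio produced
rtf = generation_time / audio_duration
print(f"RTF (Real Time Factor): {rtf:.2f}x")  # 0.50x, i.e. 2x faster than realtime
```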
demo/text_examples/1p_abs.txt ADDED
@@ -0,0 +1,2 @@
+ Generating long-form, multi-speaker conversational audio like podcasts poses significant challenges for traditional Text-to-Speech (TTS) systems, particularly in scalability, speaker consistency, and natural turn-taking. This report presents VibeVoice, a novel model designed to synthesize long-form speech with multiple speakers by employing the next-token diffusion framework, a unified method for modeling continuous data by autoregressively generating latent vectors via diffusion.
+ A core component of our approach is a continuous speech tokenizer operating at an ultra-low frame rate of 7.5 Hz. This tokenizer effectively preserves audio fidelity while significantly boosting computational efficiency for processing long sequences. This enables VibeVoice to synthesize long-form speech for up to 90 minutes (in a 64K context window length) with up to 4 speakers, capturing the authentic conversational "vibe" and surpassing all known open-source and closed-source dialogue models (for example, Gemini 2.5 Pro Preview TTS). Code and checkpoint are available now.
demo/text_examples/1p_vibevoice.txt ADDED
@@ -0,0 +1 @@
+ VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, such as podcasts, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous speech tokenizers operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice employs a next-token diffusion framework, leveraging a Large Language Model to understand textual context and dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to 90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.
demo/vibevoice_realtime_colab.ipynb ADDED
@@ -0,0 +1,198 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "d1785adb",
+    "metadata": {
+     "colab_type": "text",
+     "id": "view-in-github"
+    },
+    "source": [
+     "<a href=\"https://colab.research.google.com/github/microsoft/VibeVoice/blob/main/demo/vibevoice_realtime_colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "WvIaUJD2y0yU",
+    "metadata": {
+     "id": "WvIaUJD2y0yU"
+    },
+    "source": [
+     "# VibeVoice-Realtime Colab — T4 Quickstart\n",
+     "\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "e8fTKYGx7DZk",
+    "metadata": {
+     "id": "e8fTKYGx7DZk"
+    },
+    "source": [
+     "## Step 1: Setup Environment"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "4wxJ6QHM-ZOb",
+    "metadata": {
+     "id": "4wxJ6QHM-ZOb"
+    },
+    "outputs": [],
+    "source": [
+     "# Check for a T4 GPU\n",
+     "import torch\n",
+     "if torch.cuda.is_available() and \"T4\" in torch.cuda.get_device_name(0):\n",
+     "    print(\"✅ T4 GPU detected\")\n",
+     "else:\n",
+     "    print(\"\"\"\n",
+     "    ⚠️ WARNING: T4 GPU not detected\n",
+     "\n",
+     "    The recommended runtime for this Colab notebook is \"T4 GPU\".\n",
+     "\n",
+     "    To change the runtime type:\n",
+     "\n",
+     "    1. Click on \"Runtime\" in the top navigation menu\n",
+     "    2. Click on \"Change runtime type\"\n",
+     "    3. Select \"T4 GPU\"\n",
+     "    4. Click \"OK\" if a \"Disconnect and delete runtime\" window appears\n",
+     "    5. Click on \"Save\"\n",
+     "    \"\"\")\n",
+     "\n",
+     "# Clone the VibeVoice repository\n",
+     "![ -d /content/VibeVoice ] || git clone --quiet --branch main --depth 1 https://github.com/microsoft/VibeVoice.git /content/VibeVoice\n",
+     "print(\"✅ Cloned VibeVoice repository\")\n",
+     "\n",
+     "# Install project dependencies\n",
+     "!uv pip --quiet install --system -e /content/VibeVoice\n",
+     "!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared && chmod +x cloudflared\n",
+     "print(\"✅ Installed dependencies\")\n",
+     "\n",
+     "# Download the model\n",
+     "from huggingface_hub import snapshot_download\n",
+     "snapshot_download(\"microsoft/VibeVoice-Realtime-0.5B\", local_dir=\"/content/models/VibeVoice-Realtime-0.5B\")\n",
+     "print(\"✅ Downloaded model: microsoft/VibeVoice-Realtime-0.5B\")\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "88c727ab",
+    "metadata": {},
+    "source": [
+     "[Optional] If the download takes more than a minute, it is probably stuck. You can: (1) interrupt the execution, (2) log in to Hugging Face, and (3) try the download again."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "dec6b870",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from huggingface_hub import login\n",
+     "login()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c579654b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "snapshot_download(\"microsoft/VibeVoice-Realtime-0.5B\", local_dir=\"/content/models/VibeVoice-Realtime-0.5B\")\n",
+     "print(\"✅ Downloaded model: microsoft/VibeVoice-Realtime-0.5B\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "pgKlV7153Ifi",
+    "metadata": {
+     "id": "pgKlV7153Ifi"
+    },
+    "source": [
+     "## Step 2: Launch VibeVoice-Realtime Demo"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "Yc1N9EHswFxA",
+    "metadata": {
+     "id": "Yc1N9EHswFxA"
+    },
+    "outputs": [],
+    "source": [
+     "import subprocess, re, time, threading\n",
+     "\n",
+     "srv = subprocess.Popen(\n",
+     "    \"python /content/VibeVoice/demo/vibevoice_realtime_demo.py --model_path /content/models/VibeVoice-Realtime-0.5B --port 8000\",\n",
+     "    shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, universal_newlines=True,\n",
+     ")\n",
+     "cf = subprocess.Popen(\n",
+     "    \"./cloudflared tunnel --url http://localhost:8000 --no-autoupdate\",\n",
+     "    shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, universal_newlines=True,\n",
+     ")\n",
+     "\n",
+     "public_url = None\n",
+     "server_ready = False\n",
+     "url_pattern = re.compile(r\"(https://[a-z0-9-]+\\.trycloudflare\\.com)\")\n",
+     "\n",
+     "def read_srv():\n",
+     "    global server_ready\n",
+     "    for ln in srv.stdout:\n",
+     "        print(ln.strip())\n",
+     "        if \"Uvicorn running on\" in ln:\n",
+     "            server_ready = True\n",
+     "\n",
+     "def read_cf():\n",
+     "    global public_url\n",
+     "    for ln in cf.stdout:\n",
+     "        m = url_pattern.search(ln)\n",
+     "        if m:\n",
+     "            public_url = m.group(1)\n",
+     "            break\n",
+     "\n",
+     "threading.Thread(target=read_srv, daemon=True).start()\n",
+     "threading.Thread(target=read_cf, daemon=True).start()\n",
+     "\n",
+     "# Keep the cell alive: print the public URL once both the server and the\n",
+     "# tunnel are ready, then keep streaming server logs.\n",
+     "while True:\n",
+     "    if server_ready and public_url:\n",
+     "        print(f\"✅ Public URL: {public_url}\\n\")\n",
+     "        public_url = None\n",
+     "    time.sleep(0.25)"
+    ]
+   }
+  ],
+  "metadata": {
+   "accelerator": "GPU",
+   "colab": {
+    "gpuType": "T4",
+    "include_colab_link": true,
+    "machine_shape": "hm",
+    "name": "VibeVoice_Colab.ipynb",
+    "provenance": []
+   },
+   "kernelspec": {
+    "display_name": "Python 3",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.11"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
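Once the demo server is reachable (locally on port 8000, or via the tunnel URL the notebook prints), the `/stream` websocket endpoint defined in `demo/web/app.py` later in this diff sends JSON log events as text frames and 24 kHz mono PCM16 audio as binary frames. A minimal client sketch under those assumptions; the third-party `websockets` package and the `en-Emma_woman` preset shipped in this commit are assumed:

```python
# Minimal /stream client sketch: collect PCM16 chunks into a WAV file.
# Assumes a locally running demo (see demo/code.sh) and `pip install websockets`.
import asyncio
import json
import urllib.parse
import wave

import websockets


async def main() -> None:
    text = urllib.parse.quote("Hello from VibeVoice realtime.")
    uri = f"ws://localhost:8000/stream?text={text}&voice=en-Emma_woman"
    pcm = bytearray()
    async with websockets.connect(uri) as ws:
        try:
            while True:
                frame = await ws.recv()
                if isinstance(frame, bytes):
                    pcm.extend(frame)         # binary frame: PCM16 audio chunk
                else:
                    print(json.loads(frame))  # text frame: JSON log event
        except websockets.ConnectionClosed:
            pass  # server closed the stream when generation finished
    with wave.open("stream_out.wav", "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)        # 16-bit samples
        wf.setframerate(24_000)   # SAMPLE_RATE in demo/web/app.py
        wf.writeframes(bytes(pcm))


asyncio.run(main())
```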
demo/vibevoice_realtime_demo.py ADDED
@@ -0,0 +1,17 @@
+ import argparse, os, uvicorn
+
+ def main():
+     p = argparse.ArgumentParser()
+     p.add_argument("--port", type=int, default=3000)
+     p.add_argument("--model_path", type=str, default="default_model")
+     p.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda", "mpx", "mps"])
+     p.add_argument("--reload", action="store_true", help="Auto-reload the app on code changes")
+     args = p.parse_args()
+
+     os.environ["MODEL_PATH"] = args.model_path
+     os.environ["MODEL_DEVICE"] = args.device
+
+     uvicorn.run("web.app:app", host="0.0.0.0", port=args.port, reload=args.reload)
+
+ if __name__ == "__main__":
+     main()
demo/voices/streaming_model/de-Spk0_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba2c3e06c52ca02a851326d8a354188c8cafe4543c717d8beb8b64fe3466913a
+ size 7039666
demo/voices/streaming_model/de-Spk1_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:11b4fa505f7d636af7047632793fe639b1e826d46fc6c00d64dacdcf0805ad72
+ size 5290778
demo/voices/streaming_model/en-Carter_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7bfdf1cd4939c22469bcfc6f427ae9c4467b3df46c2c14303a39c294cfc6897
+ size 4256002
demo/voices/streaming_model/en-Davis_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67561d63bfa2153616e4c02fd967007c182593fc53738a6ad94bf5f84e8832ac
+ size 2471258
demo/voices/streaming_model/en-Emma_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75b15c481e0d848991f1789620aa9929c583ec2c5f701f8152362cf74498bbf8
+ size 3343090
demo/voices/streaming_model/en-Frank_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:acaa8f1a4f46a79f8f5660cfb7a3af06ef473389319df7debc07376fdc840e47
+ size 3359578
demo/voices/streaming_model/en-Grace_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f0ef02a3f3cace04cf721608b65273879466bb15fe4044e46ec6842190f6bb1
+ size 2772466
demo/voices/streaming_model/en-Mike_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afb64b580fbc6fab09af04572bbbd2b3906ff8ed35a28731a90b8681e47bdc89
+ size 2016234
demo/voices/streaming_model/fr-Spk0_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64f12314f9df2348f0fd6cdbee9efb0f1ebfc286034560e207be5efc4108a368
+ size 4386482
demo/voices/streaming_model/fr-Spk1_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1425e8b7fab2fb03d400ca23d672ca96dbbb6b7b000e52332117f2fece33077
+ size 4272170
demo/voices/streaming_model/in-Samuel_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6ec5e8cde4006aa3f26fde0422124f3296f83f5738b5b4506268855a305cb06
+ size 3782658
demo/voices/streaming_model/it-Spk0_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a838184c0304802c696bac887d15a8143539fbb0b5371fe53de66059a010f3d
+ size 2552026
demo/voices/streaming_model/it-Spk1_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:349ce0d28e93ea8b1df55800527f3a8ba8a55bb9e88990841db1bc9ee424519a
+ size 2854514
demo/voices/streaming_model/jp-Spk0_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4881286aa2fe14e65a800d64a7ae7cd0a7ccda53ff6de88a8d6590980c20e16d
+ size 4668234
demo/voices/streaming_model/jp-Spk1_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d7a3f5ca602912152c483516bc0198ea13765affd6fbf16fc1bed5e91e1cbbda
+ size 4637994
demo/voices/streaming_model/kr-Spk0_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c6d15b8d66f8c271655d000b5923cb33a9b564bf84484a82b10431dc1a2fafef
+ size 4154002
demo/voices/streaming_model/kr-Spk1_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65514854c7ffc7080aba964cf40811822358b19397132ddaf9b3cdb168c394f3
+ size 5865138
demo/voices/streaming_model/nl-Spk0_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:70e9221ed11737e2b32af3ccfebb673bd1122f9f9ea8d357efd91e9f3b8e8f72
+ size 3704498
demo/voices/streaming_model/nl-Spk1_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76bfd3c13bb9d61760f6146c4eadc032e9d19ddecd4eed6832bc323a8cb5e8c5
+ size 5095874
demo/voices/streaming_model/pl-Spk0_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c06a9e251ab08ade328dfe16e01eb6f5ddc4ba9c66695a73e3aae8a919396bd
+ size 3750522
demo/voices/streaming_model/pl-Spk1_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a30782a49635f0e60247529b4557cd39713fd158ce2f99bea2ac506bffe7427
+ size 4978626
demo/voices/streaming_model/pt-Spk0_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40f3a231e011f9c5e1eb4e852599751ce749439edceb36d0fa2bca952387f5a9
+ size 2268290
demo/voices/streaming_model/pt-Spk1_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06a0052170a4ae5217931eb84318a6f86279439b536ad9780c724526d84148e5
+ size 3554890
demo/voices/streaming_model/sp-Spk0_woman.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8b2c54cfe9b46a711642a96cdcc49545140374ae769cc62cf89c593c38b13c0
+ size 4243906
demo/voices/streaming_model/sp-Spk1_man.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9857e3c03334b1934d34dd3aed8b36cb1e6a09367e99c9e164cd3cd8f091edcb
+ size 5130522
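These `.pt` files are not raw audio but embedded voice prompts: serialized prefill state that the demo scripts load with `torch.load(..., weights_only=False)` and pass to `generate()` as `cached_prompt` / `all_prefilled_outputs`. A minimal sketch for inspecting one; beyond the `tts_lm` / `last_hidden_state` entry that the inference script reads, the key layout is an assumption:

```python
# Peek at an embedded voice preset; the demos load these as cached prefill
# state rather than as audio. weights_only=False is required because the
# files store arbitrary Python objects, so only load presets you trust.
import torch

preset = torch.load(
    "demo/voices/streaming_model/en-Emma_woman.pt",
    map_location="cpu",
    weights_only=False,
)
print(type(preset))
if isinstance(preset, dict):
    print(list(preset.keys()))  # e.g. expect a "tts_lm" entry (see the demo script)
```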
demo/web/app.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import builtins
3
+ import asyncio
4
+ import json
5
+ import os
6
+ import threading
7
+ import traceback
8
+ from pathlib import Path
9
+ from queue import Empty, Queue
10
+ from typing import Any, Callable, Dict, Iterator, Optional, Tuple, cast
11
+
12
+ import numpy as np
13
+ import torch
14
+ from fastapi import FastAPI, WebSocket
15
+ from fastapi.responses import FileResponse
16
+ from fastapi.staticfiles import StaticFiles
17
+ from starlette.websockets import WebSocketDisconnect, WebSocketState
18
+
19
+ from vibevoice.modular.modeling_vibevoice_streaming_inference import (
20
+ VibeVoiceStreamingForConditionalGenerationInference,
21
+ )
22
+ from vibevoice.processor.vibevoice_streaming_processor import (
23
+ VibeVoiceStreamingProcessor,
24
+ )
25
+ from vibevoice.modular.streamer import AudioStreamer
26
+
27
+ import copy
28
+
29
+ BASE = Path(__file__).parent
30
+ SAMPLE_RATE = 24_000
31
+
32
+
33
+ def get_timestamp():
34
+ timestamp = datetime.datetime.utcnow().replace(
35
+ tzinfo=datetime.timezone.utc
36
+ ).astimezone(
37
+ datetime.timezone(datetime.timedelta(hours=8))
38
+ ).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
39
+ return timestamp
40
+
41
+ class StreamingTTSService:
42
+ def __init__(
43
+ self,
44
+ model_path: str,
45
+ device: str = "cuda",
46
+ inference_steps: int = 5,
47
+ ) -> None:
48
+ # Keep model_path as string for HuggingFace repo IDs (Path() converts / to \ on Windows)
49
+ self.model_path = model_path
50
+ self.inference_steps = inference_steps
51
+ self.sample_rate = SAMPLE_RATE
52
+
53
+ self.processor: Optional[VibeVoiceStreamingProcessor] = None
54
+ self.model: Optional[VibeVoiceStreamingForConditionalGenerationInference] = None
55
+ self.voice_presets: Dict[str, Path] = {}
56
+ self.default_voice_key: Optional[str] = None
57
+ self._voice_cache: Dict[str, Tuple[object, Path, str]] = {}
58
+
59
+ if device == "mpx":
60
+ print("Note: device 'mpx' detected, treating it as 'mps'.")
61
+ device = "mps"
62
+ if device == "mps" and not torch.backends.mps.is_available():
63
+ print("Warning: MPS not available. Falling back to CPU.")
64
+ device = "cpu"
65
+ self.device = device
66
+ self._torch_device = torch.device(device)
67
+
68
+ def load(self) -> None:
69
+ print(f"[startup] Loading processor from {self.model_path}")
70
+ self.processor = VibeVoiceStreamingProcessor.from_pretrained(self.model_path)
71
+
72
+
73
+ # Decide dtype & attention
74
+ if self.device == "mps":
75
+ load_dtype = torch.float32
76
+ device_map = None
77
+ attn_impl_primary = "sdpa"
78
+ elif self.device == "cuda":
79
+ load_dtype = torch.bfloat16
80
+ device_map = 'cuda'
81
+ attn_impl_primary = "flash_attention_2"
82
+ else:
83
+ load_dtype = torch.float32
84
+ device_map = 'cpu'
85
+ attn_impl_primary = "sdpa"
86
+ print(f"Using device: {device_map}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}")
87
+ # Load model
88
+ try:
89
+ self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
90
+ self.model_path,
91
+ torch_dtype=load_dtype,
92
+ device_map=device_map,
93
+ attn_implementation=attn_impl_primary,
94
+ )
95
+
96
+ if self.device == "mps":
97
+ self.model.to("mps")
98
+ except Exception as e:
99
+ if attn_impl_primary == 'flash_attention_2':
100
+ print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.")
101
+
102
+ self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
103
+ self.model_path,
104
+ torch_dtype=load_dtype,
105
+ device_map=self.device,
106
+ attn_implementation='sdpa',
107
+ )
108
+ print("Load model with SDPA successfully ")
109
+ else:
110
+ raise e
111
+
112
+ self.model.eval()
113
+
114
+ self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
115
+ self.model.model.noise_scheduler.config,
116
+ algorithm_type="sde-dpmsolver++",
117
+ beta_schedule="squaredcos_cap_v2",
118
+ )
119
+ self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
120
+
121
+ self.voice_presets = self._load_voice_presets()
122
+ preset_name = os.environ.get("VOICE_PRESET")
123
+ self.default_voice_key = self._determine_voice_key(preset_name)
124
+ self._ensure_voice_cached(self.default_voice_key)
125
+
126
+ def _load_voice_presets(self) -> Dict[str, Path]:
127
+ voices_dir = BASE.parent / "voices" / "streaming_model"
128
+ if not voices_dir.exists():
129
+ raise RuntimeError(f"Voices directory not found: {voices_dir}")
130
+
131
+ presets: Dict[str, Path] = {}
132
+ for pt_path in voices_dir.glob("*.pt"):
133
+ presets[pt_path.stem] = pt_path
134
+
135
+ if not presets:
136
+ raise RuntimeError(f"No voice preset (.pt) files found in {voices_dir}")
137
+
138
+ print(f"[startup] Found {len(presets)} voice presets")
139
+ return dict(sorted(presets.items()))
140
+
141
+ def _determine_voice_key(self, name: Optional[str]) -> str:
142
+ if name and name in self.voice_presets:
143
+ return name
144
+
145
+ default_key = "en-WHTest_man"
146
+ if default_key in self.voice_presets:
147
+ return default_key
148
+
149
+ first_key = next(iter(self.voice_presets))
150
+ print(f"[startup] Using fallback voice preset: {first_key}")
151
+ return first_key
152
+
153
+ def _ensure_voice_cached(self, key: str) -> Tuple[object, Path, str]:
154
+ if key not in self.voice_presets:
155
+ raise RuntimeError(f"Voice preset {key!r} not found")
156
+
157
+ if key not in self._voice_cache:
158
+ preset_path = self.voice_presets[key]
159
+ print(f"[startup] Loading voice preset {key} from {preset_path}")
160
+ print(f"[startup] Loading prefilled prompt from {preset_path}")
161
+ prefilled_outputs = torch.load(
162
+ preset_path,
163
+ map_location=self._torch_device,
164
+ weights_only=False,
165
+ )
166
+ self._voice_cache[key] = prefilled_outputs
167
+
168
+ return self._voice_cache[key]
169
+
170
+ def _get_voice_resources(self, requested_key: Optional[str]) -> Tuple[str, object, Path, str]:
171
+ key = requested_key if requested_key and requested_key in self.voice_presets else self.default_voice_key
172
+ if key is None:
173
+ key = next(iter(self.voice_presets))
174
+ self.default_voice_key = key
175
+
176
+ prefilled_outputs = self._ensure_voice_cached(key)
177
+ return key, prefilled_outputs
178
+
179
+ def _prepare_inputs(self, text: str, prefilled_outputs: object):
180
+ if not self.processor or not self.model:
181
+ raise RuntimeError("StreamingTTSService not initialized")
182
+
183
+ processor_kwargs = {
184
+ "text": text.strip(),
185
+ "cached_prompt": prefilled_outputs,
186
+ "padding": True,
187
+ "return_tensors": "pt",
188
+ "return_attention_mask": True,
189
+ }
190
+
191
+ processed = self.processor.process_input_with_cached_prompt(**processor_kwargs)
192
+
193
+ prepared = {
194
+ key: value.to(self._torch_device) if hasattr(value, "to") else value
195
+ for key, value in processed.items()
196
+ }
197
+ return prepared
198
+
199
+ def _run_generation(
200
+ self,
201
+ inputs,
202
+ audio_streamer: AudioStreamer,
203
+ errors,
204
+ cfg_scale: float,
205
+ do_sample: bool,
206
+ temperature: float,
207
+ top_p: float,
208
+ refresh_negative: bool,
209
+ prefilled_outputs,
210
+ stop_event: threading.Event,
211
+ ) -> None:
212
+ try:
213
+ self.model.generate(
214
+ **inputs,
215
+ max_new_tokens=None,
216
+ cfg_scale=cfg_scale,
217
+ tokenizer=self.processor.tokenizer,
218
+ generation_config={
219
+ "do_sample": do_sample,
220
+ "temperature": temperature if do_sample else 1.0,
221
+ "top_p": top_p if do_sample else 1.0,
222
+ },
223
+ audio_streamer=audio_streamer,
224
+ stop_check_fn=stop_event.is_set,
225
+ verbose=False,
226
+ refresh_negative=refresh_negative,
227
+ all_prefilled_outputs=copy.deepcopy(prefilled_outputs),
228
+ )
229
+ except Exception as exc: # pragma: no cover - diagnostic logging
230
+ errors.append(exc)
231
+ traceback.print_exc()
232
+ audio_streamer.end()
233
+
234
+ def stream(
235
+ self,
236
+ text: str,
237
+ cfg_scale: float = 1.5,
238
+ do_sample: bool = False,
239
+ temperature: float = 0.9,
240
+ top_p: float = 0.9,
241
+ refresh_negative: bool = True,
242
+ inference_steps: Optional[int] = None,
243
+ voice_key: Optional[str] = None,
244
+ log_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
245
+ stop_event: Optional[threading.Event] = None,
246
+ ) -> Iterator[np.ndarray]:
247
+ if not text.strip():
248
+ return
249
+ text = text.replace("’", "'")
250
+ selected_voice, prefilled_outputs = self._get_voice_resources(voice_key)
251
+
252
+ def emit(event: str, **payload: Any) -> None:
253
+ if log_callback:
254
+ try:
255
+ log_callback(event, **payload)
256
+ except Exception as exc:
257
+ print(f"[log_callback] Error while emitting {event}: {exc}")
258
+
259
+ steps_to_use = self.inference_steps
260
+ if inference_steps is not None:
261
+ try:
262
+ parsed_steps = int(inference_steps)
263
+ if parsed_steps > 0:
264
+ steps_to_use = parsed_steps
265
+ except (TypeError, ValueError):
266
+ pass
267
+ if self.model:
268
+ self.model.set_ddpm_inference_steps(num_steps=steps_to_use)
269
+ self.inference_steps = steps_to_use
270
+
271
+ inputs = self._prepare_inputs(text, prefilled_outputs)
272
+ audio_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
273
+ errors: list = []
274
+ stop_signal = stop_event or threading.Event()
275
+
276
+ thread = threading.Thread(
277
+ target=self._run_generation,
278
+ kwargs={
279
+ "inputs": inputs,
280
+ "audio_streamer": audio_streamer,
281
+ "errors": errors,
282
+ "cfg_scale": cfg_scale,
283
+ "do_sample": do_sample,
284
+ "temperature": temperature,
285
+ "top_p": top_p,
286
+ "refresh_negative": refresh_negative,
287
+ "prefilled_outputs": prefilled_outputs,
288
+ "stop_event": stop_signal,
289
+ },
290
+ daemon=True,
291
+ )
292
+ thread.start()
293
+
294
+ generated_samples = 0
295
+
296
+ try:
297
+ stream = audio_streamer.get_stream(0)
298
+ for audio_chunk in stream:
299
+ if torch.is_tensor(audio_chunk):
300
+ audio_chunk = audio_chunk.detach().cpu().to(torch.float32).numpy()
301
+ else:
302
+ audio_chunk = np.asarray(audio_chunk, dtype=np.float32)
303
+
304
+ if audio_chunk.ndim > 1:
305
+ audio_chunk = audio_chunk.reshape(-1)
306
+
307
+ peak = np.max(np.abs(audio_chunk)) if audio_chunk.size else 0.0
308
+ if peak > 1.0:
309
+ audio_chunk = audio_chunk / peak
310
+
311
+ generated_samples += int(audio_chunk.size)
312
+ emit(
313
+ "model_progress",
314
+ generated_sec=generated_samples / self.sample_rate,
315
+ chunk_sec=audio_chunk.size / self.sample_rate,
316
+ )
317
+
318
+ chunk_to_yield = audio_chunk.astype(np.float32, copy=False)
319
+
320
+ yield chunk_to_yield
321
+ finally:
322
+ stop_signal.set()
323
+ audio_streamer.end()
324
+ thread.join()
325
+ if errors:
326
+ emit("generation_error", message=str(errors[0]))
327
+ raise errors[0]
328
+
329
+ def chunk_to_pcm16(self, chunk: np.ndarray) -> bytes:
330
+ chunk = np.clip(chunk, -1.0, 1.0)
331
+ pcm = (chunk * 32767.0).astype(np.int16)
332
+ return pcm.tobytes()
333
+
334
+
335
+ app = FastAPI()
336
+
337
+
338
+ @app.on_event("startup")
339
+ async def _startup() -> None:
340
+ model_path = os.environ.get("MODEL_PATH")
341
+ if not model_path:
342
+ raise RuntimeError("MODEL_PATH not set in environment")
343
+
344
+ device = os.environ.get("MODEL_DEVICE", "cuda")
345
+
346
+ service = StreamingTTSService(
347
+ model_path=model_path,
348
+ device=device
349
+ )
350
+ service.load()
351
+
352
+ app.state.tts_service = service
353
+ app.state.model_path = model_path
354
+ app.state.device = device
355
+ app.state.websocket_lock = asyncio.Lock()
356
+ print("[startup] Model ready.")
357
+
358
+
359
+ def streaming_tts(text: str, **kwargs) -> Iterator[np.ndarray]:
360
+ service: StreamingTTSService = app.state.tts_service
361
+ yield from service.stream(text, **kwargs)
362
+
363
+ @app.websocket("/stream")
364
+ async def websocket_stream(ws: WebSocket) -> None:
365
+ await ws.accept()
366
+ text = ws.query_params.get("text", "")
367
+ print(f"Client connected, text={text!r}")
368
+ cfg_param = ws.query_params.get("cfg")
369
+ steps_param = ws.query_params.get("steps")
370
+ voice_param = ws.query_params.get("voice")
371
+
372
+ try:
373
+ cfg_scale = float(cfg_param) if cfg_param is not None else 1.5
374
+ except ValueError:
375
+ cfg_scale = 1.5
376
+ if cfg_scale <= 0:
377
+ cfg_scale = 1.5
378
+ try:
379
+ inference_steps = int(steps_param) if steps_param is not None else None
380
+ if inference_steps is not None and inference_steps <= 0:
381
+ inference_steps = None
382
+ except ValueError:
383
+ inference_steps = None
384
+
385
+ service: StreamingTTSService = app.state.tts_service
386
+ lock: asyncio.Lock = app.state.websocket_lock
387
+
388
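+ # Single-request policy: the service processes one synthesis at a time and turns away new clients while busy.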
+ if lock.locked():
389
+ busy_message = {
390
+ "type": "log",
391
+ "event": "backend_busy",
392
+ "data": {"message": "Please wait for the other requests to complete."},
393
+ "timestamp": get_timestamp(),
394
+ }
395
+ print("Please wait for the other requests to complete.")
396
+ try:
397
+ await ws.send_text(json.dumps(busy_message))
398
+ except Exception:
399
+ pass
400
+ await ws.close(code=1013, reason="Service busy")
401
+ return
402
+
403
+ acquired = False
404
+ try:
405
+ await lock.acquire()
406
+ acquired = True
407
+
408
+ log_queue: "Queue[Dict[str, Any]]" = Queue()
409
+
410
+ def enqueue_log(event: str, **data: Any) -> None:
411
+ log_queue.put({"event": event, "data": data})
412
+
413
+ async def flush_logs() -> None:
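+ # Drain queued log events to the client; the generation thread cannot await WebSocket sends itself.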
414
+ while True:
415
+ try:
416
+ entry = log_queue.get_nowait()
417
+ except Empty:
418
+ break
419
+ message = {
420
+ "type": "log",
421
+ "event": entry.get("event"),
422
+ "data": entry.get("data", {}),
423
+ "timestamp": get_timestamp(),
424
+ }
425
+ try:
426
+ await ws.send_text(json.dumps(message))
427
+ except Exception:
428
+ break
429
+
430
+ enqueue_log(
431
+ "backend_request_received",
432
+ text_length=len(text or ""),
433
+ cfg_scale=cfg_scale,
434
+ inference_steps=inference_steps,
435
+ voice=voice_param,
436
+ )
437
+
438
+ stop_signal = threading.Event()
439
+
440
+ iterator = streaming_tts(
441
+ text,
442
+ cfg_scale=cfg_scale,
443
+ inference_steps=inference_steps,
444
+ voice_key=voice_param,
445
+ log_callback=enqueue_log,
446
+ stop_event=stop_signal,
447
+ )
448
+ sentinel = object()
449
+ first_ws_send_logged = False
450
+
451
+ await flush_logs()
452
+
453
+ try:
454
+ while ws.client_state == WebSocketState.CONNECTED:
455
+ await flush_logs()
456
+ chunk = await asyncio.to_thread(next, iterator, sentinel)
457
+ if chunk is sentinel:
458
+ break
459
+ chunk = cast(np.ndarray, chunk)
460
+ payload = service.chunk_to_pcm16(chunk)
461
+ await ws.send_bytes(payload)
462
+ if not first_ws_send_logged:
463
+ first_ws_send_logged = True
464
+ enqueue_log("backend_first_chunk_sent")
465
+ await flush_logs()
466
+ except WebSocketDisconnect:
467
+ print("Client disconnected (WebSocketDisconnect)")
468
+ enqueue_log("client_disconnected")
469
+ stop_signal.set()
470
+ finally:
471
+ stop_signal.set()
472
+ enqueue_log("backend_stream_complete")
473
+ await flush_logs()
474
+ try:
475
+ iterator_close = getattr(iterator, "close", None)
476
+ if callable(iterator_close):
477
+ iterator_close()
478
+ except Exception:
479
+ pass
480
+ # clear the log queue
481
+ while not log_queue.empty():
482
+ try:
483
+ log_queue.get_nowait()
484
+ except Empty:
485
+ break
486
+ if ws.client_state == WebSocketState.CONNECTED:
487
+ await ws.close()
488
+ print("WS handler exit")
489
+ finally:
490
+ if acquired:
491
+ lock.release()
492
+
493
+
494
+ @app.get("/")
495
+ def index():
496
+ return FileResponse(BASE / "index.html")
497
+
498
+
499
+ @app.get("/config")
500
+ def get_config():
501
+ service: StreamingTTSService = app.state.tts_service
502
+ voices = sorted(service.voice_presets.keys())
503
+ return {
504
+ "voices": voices,
505
+ "default_voice": service.default_voice_key,
506
+ }
507
+
demo/web/index.html ADDED
@@ -0,0 +1,1017 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
+ <head>
3
+ <meta charset="UTF-8" />
4
+ <title>VibeVoice-Realtime TTS Demo</title>
5
+ <style>
6
+ :root {
7
+ --bg: #f5f7fc;
8
+ --surface: #ffffff;
9
+ --accent: #5562ff;
10
+ --accent-strong: #3f4dff;
11
+ --text-primary: #1f2742;
12
+ --text-muted: #5d6789;
13
+ --border: rgba(85, 98, 255, 0.18);
14
+ --shadow: 0 18px 45px rgba(31, 39, 66, 0.08);
15
+ }
16
+
17
+ .helper-text {
18
+ font-size: 12px;
19
+ color: #8a93b5;
20
+ }
21
+
22
+ * {
23
+ box-sizing: border-box;
24
+ }
25
+
26
+ body {
27
+ margin: 0;
28
+ background: var(--bg);
29
+ font-family: 'Inter', 'Segoe UI', Roboto, Helvetica, sans-serif;
30
+ color: var(--text-primary);
31
+ display: flex;
32
+ justify-content: center;
33
+ padding: 48px 20px;
34
+ }
35
+
36
+ .app-shell {
37
+ width: min(960px, 100%);
38
+ background: var(--surface);
39
+ border-radius: 20px;
40
+ padding: 36px 40px 44px;
41
+ box-shadow: var(--shadow);
42
+ display: flex;
43
+ flex-direction: column;
44
+ gap: 28px;
45
+ }
46
+
47
+ h1 {
48
+ margin: 0;
49
+ text-align: center;
50
+ font-size: 30px;
51
+ font-weight: 700;
52
+ letter-spacing: 0.01em;
53
+ }
54
+
55
+ .panel {
56
+ display: flex;
57
+ flex-direction: column;
58
+ gap: 10px;
59
+ }
60
+
61
+ .field {
62
+ display: flex;
63
+ flex-direction: column;
64
+ gap: 8px;
65
+ }
66
+
67
+ .field-label {
68
+ font-weight: 600;
69
+ font-size: 15px;
70
+ color: var(--text-primary);
71
+ }
72
+
73
+ .text-input {
74
+ width: 100%;
75
+ min-height: 140px;
76
+ max-height: 240px;
77
+ border: 1px solid rgba(31, 39, 66, 0.14);
78
+ border-radius: 12px;
79
+ padding: 14px 16px;
80
+ font-size: 15px;
81
+ line-height: 1.6;
82
+ font-family: inherit;
83
+ background: #f9faff;
84
+ transition: border-color 0.2s, box-shadow 0.2s;
85
+ resize: vertical;
86
+ }
87
+
88
+ .text-input:focus {
89
+ outline: none;
90
+ border-color: var(--accent);
91
+ box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18);
92
+ background: #fff;
93
+ }
94
+
95
+ #streamingPreviewContainer {
96
+ border-radius: 14px;
97
+ border: 1px solid var(--border);
98
+ background: linear-gradient(135deg, #eef2ff 0%, #f7f9ff 100%);
99
+ padding: 18px 20px;
100
+ box-shadow: inset 0 1px 2px rgba(85, 98, 255, 0.12);
101
+ }
102
+
103
+ #streamingPreviewHeader {
104
+ font-weight: 600;
105
+ color: var(--text-primary);
106
+ display: flex;
107
+ align-items: center;
108
+ gap: 10px;
109
+ font-size: 14px;
110
+ margin-bottom: 8px;
111
+ }
112
+
113
+ #streamingPreviewNote {
114
+ font-weight: 400;
115
+ font-size: 12px;
116
+ color: var(--text-muted);
117
+ }
118
+
119
+ #streamingPreview {
120
+ min-height: 70px;
121
+ padding: 10px 12px;
122
+ border-radius: 10px;
123
+ background: rgba(255, 255, 255, 0.9);
124
+ border: 1px solid rgba(85, 98, 255, 0.25);
125
+ font-family: 'Courier New', Courier, monospace;
126
+ font-size: 14px;
127
+ line-height: 1.5;
128
+ color: var(--text-primary);
129
+ white-space: pre-wrap;
130
+ }
131
+
132
+ #streamingPreview.streaming-active::after {
133
+ content: "";
134
+ display: inline-block;
135
+ width: 2px;
136
+ height: 1.1em;
137
+ background: var(--accent);
138
+ margin-left: 2px;
139
+ animation: previewCaret 0.9s steps(1) infinite;
140
+ vertical-align: bottom;
141
+ }
142
+
143
+ @keyframes previewCaret {
144
+ 0%, 50% {
145
+ opacity: 1;
146
+ }
147
+ 51%, 100% {
148
+ opacity: 0;
149
+ }
150
+ }
151
+
152
+ .control-panel {
153
+ display: flex;
154
+ flex-direction: column;
155
+ gap: 18px;
156
+ }
157
+
158
+ .inline-field {
159
+ display: flex;
160
+ flex-direction: column;
161
+ gap: 6px;
162
+ }
163
+
164
+ .select-control {
165
+ width: 220px;
166
+ border: 1px solid rgba(31, 39, 66, 0.14);
167
+ border-radius: 10px;
168
+ padding: 8px 12px;
169
+ font-size: 14px;
170
+ font-family: inherit;
171
+ background: #fbfcff;
172
+ color: var(--text-primary);
173
+ transition: border-color 0.2s, box-shadow 0.2s;
174
+ }
175
+
176
+ .select-control:focus {
177
+ outline: none;
178
+ border-color: var(--accent);
179
+ box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18);
180
+ background: #fff;
181
+ }
182
+
183
+ .control-row {
184
+ display: flex;
185
+ align-items: center;
186
+ flex-wrap: wrap;
187
+ gap: 20px 28px;
188
+ }
189
+
190
+ .range-control {
191
+ display: flex;
192
+ align-items: center;
193
+ gap: 12px;
194
+ font-size: 14px;
195
+ color: var(--text-primary);
196
+ }
197
+
198
+ .range-control input[type="range"] {
199
+ width: 200px;
200
+ accent-color: var(--accent);
201
+ }
202
+
203
+ .range-value {
204
+ font-weight: 600;
205
+ color: var(--text-primary);
206
+ min-width: 42px;
207
+ text-align: right;
208
+ }
209
+
210
+ #playback {
211
+ background: var(--accent);
212
+ color: #fff;
213
+ border: none;
214
+ padding: 10px 24px;
215
+ border-radius: 999px;
216
+ cursor: pointer;
217
+ font-weight: 600;
218
+ font-size: 14px;
219
+ box-shadow: 0 8px 16px rgba(85, 98, 255, 0.25);
220
+ transition: transform 0.15s, box-shadow 0.15s, background 0.15s;
221
+ }
222
+
223
+ #playback:hover {
224
+ transform: translateY(-1px);
225
+ box-shadow: 0 10px 20px rgba(85, 98, 255, 0.28);
226
+ }
227
+
228
+ #playback:active {
229
+ transform: translateY(0);
230
+ }
231
+
232
+ #playback.playing {
233
+ background: var(--accent-strong);
234
+ }
235
+
236
+ .secondary-btn {
237
+ border: 1px solid rgba(31, 39, 66, 0.18);
238
+ background: #f1f3ff;
239
+ color: var(--text-primary);
240
+ padding: 8px 18px;
241
+ border-radius: 999px;
242
+ cursor: pointer;
243
+ font-size: 13px;
244
+ font-weight: 500;
245
+ transition: background 0.15s, border-color 0.15s;
246
+ }
247
+
248
+ .secondary-btn:hover {
249
+ background: #e6e9ff;
250
+ border-color: rgba(31, 39, 66, 0.26);
251
+ }
252
+
253
+ .secondary-btn:disabled {
254
+ opacity: 0.55;
255
+ cursor: not-allowed;
256
+ }
257
+
258
+ .metrics {
259
+ display: flex;
260
+ flex-wrap: wrap;
261
+ gap: 16px 32px;
262
+ font-size: 14px;
263
+ color: var(--text-muted);
264
+ }
265
+
266
+ .metrics span {
267
+ display: flex;
268
+ align-items: baseline;
269
+ gap: 6px;
270
+ }
271
+
272
+ .metrics span strong {
273
+ color: var(--text-primary);
274
+ font-weight: 600;
275
+ }
276
+
277
+ .metric-unit {
278
+ color: var(--text-muted);
279
+ font-size: 13px;
280
+ }
281
+
282
+ #logOutput {
283
+ max-height: 260px;
284
+ overflow-y: auto;
285
+ background: #f7f9ff;
286
+ color: var(--text-primary);
287
+ padding: 16px 18px;
288
+ border: 1px solid rgba(31, 39, 66, 0.12);
289
+ border-radius: 12px;
290
+ font-size: 13px;
291
+ line-height: 1.6;
292
+ box-shadow: inset 0 1px 2px rgba(15, 23, 42, 0.06);
293
+ font-family: 'Fira Code', 'Courier New', Courier, monospace;
294
+ margin-top: 0px;
295
+ }
296
+
297
+ @media (max-width: 720px) {
298
+ .app-shell {
299
+ padding: 28px 20px 36px;
300
+ gap: 24px;
301
+ }
302
+
303
+ .select-control {
304
+ width: 100%;
305
+ }
306
+
307
+ .control-row {
308
+ flex-direction: column;
309
+ align-items: flex-start;
310
+ gap: 16px;
311
+ }
312
+
313
+ #playback {
314
+ width: 100%;
315
+ text-align: center;
316
+ }
317
+ }
318
+ </style>
319
+ </head>
+ <body>
320
+ <div class="app-shell">
321
+ <h1>VibeVoice-Realtime TTS Demo</h1>
322
+
323
+ <section class="panel">
324
+ <label class="field">
325
+ <span class="field-label">Text</span>
326
+ <textarea
327
+ id="prompt"
328
+ class="text-input"
329
+ rows="4"
330
+ >Enter your text here and click "Start" to instantly hear the VibeVoice-Realtime TTS output audio.</textarea>
331
+ </label>
332
+
333
+ <div id="streamingPreviewContainer">
334
+ <div id="streamingPreviewHeader">
335
+ <span>Streaming Input Text</span>
336
+ </div>
337
+ <div id="streamingPreview" aria-live="polite">This area will display the streaming input text in real time.</div>
338
+ </div>
339
+ </section>
340
+ <span class="helper-text">This demo requires the full text to be provided upfront. The model then receives the text via streaming input during synthesis.<br>
341
+ For non-punctuation special characters, applying text normalization before processing often yields better results.</span>
342
+
343
+ <section class="panel control-panel">
344
+ <div class="inline-field">
345
+ <span class="field-label">Speaker</span>
346
+ <select id="voiceSelect" class="select-control">
347
+ <option value="">Loading...</option>
348
+ </select>
349
+ </div>
350
+
351
+ <div class="control-row">
352
+ <label class="range-control">
353
+ <span>CFG</span>
354
+ <input id="cfgScale" type="range" min="1.3" max="3" step="0.05" value="1.5" />
355
+ <span class="range-value" id="cfgValue">1.5</span>
356
+ </label>
357
+ <label class="range-control">
358
+ <span>Inference Steps</span>
359
+ <input id="inferenceSteps" type="range" min="5" max="20" step="1" value="5" />
360
+ <span class="range-value" id="stepsValue">5</span>
361
+ </label>
362
+ <button id="resetControls" type="button" class="secondary-btn">Reset Controls</button>
363
+ </div>
364
+
365
+ <div class="control-row">
366
+ <button id="playback">Start</button>
367
+ <button id="saveAudio" type="button" class="secondary-btn" disabled>Save</button>
368
+ </div>
369
+ </section>
370
+
371
+ <section class="panel">
372
+ <div class="metrics">
373
+ <span>Model Generated Audio<strong id="modelGenerated">0.00</strong><span class="metric-unit">s</span></span>
374
+ <span>Audio Played<strong id="playbackElapsed">0.00</strong><span class="metric-unit">s</span></span>
375
+ </div>
376
+ </section>
377
+
378
+ <section class="panel">
379
+ <span class="field-label">Runtime Logs</span>
380
+ <pre id="logOutput"></pre>
381
+ </section>
382
+ </div>
383
+
384
+
385
+ <script>
386
+ (() => {
387
+ const SAMPLE_RATE = 24_000;
388
+ const BUFFER_SIZE = 2048;
389
+ const PREBUFFER_SEC = 0.1;
390
+
391
+ let audioCtx = null;
392
+ let scriptNode = null;
393
+ let socket = null;
394
+ let buffer = new Float32Array(0);
395
+ let isPlaying = false;
396
+ let hasStartedPlayback = false;
397
+ let silentFrameCount = 0;
398
+
399
+ const promptInput = document.getElementById('prompt');
400
+ const streamingPreview = document.getElementById('streamingPreview');
401
+ const controlBtn = document.getElementById('playback');
402
+ const cfgSelect = document.getElementById('cfgScale');
403
+ const stepsSelect = document.getElementById('inferenceSteps');
404
+ const voiceSelect = document.getElementById('voiceSelect');
405
+ const cfgValueLabel = document.getElementById('cfgValue');
406
+ const stepsValueLabel = document.getElementById('stepsValue');
407
+ const modelGeneratedLabel = document.getElementById('modelGenerated');
408
+ const playbackElapsedLabel = document.getElementById('playbackElapsed');
409
+ const logOutput = document.getElementById('logOutput');
410
+ const resetBtn = document.getElementById('resetControls');
411
+ const saveBtn = document.getElementById('saveAudio');
412
+
413
+ let playbackTimer = null;
414
+ let lastPlaybackElapsed = 0;
415
+ let playbackSamples = 0;
416
+ let modelGeneratedTotal = 0;
417
+ let firstBrowserChunkLogged = false;
418
+ let playbackStartedLogged = false;
419
+ const logEntries = [];
420
+ let logSequence = 0;
421
+ let recordedChunks = [];
422
+ let recordedSamples = 0;
423
+ let recordingComplete = false;
424
+ let downloadUrl = null;
425
+
426
+ const revokeDownloadUrl = () => {
427
+ if (downloadUrl) {
428
+ URL.revokeObjectURL(downloadUrl);
429
+ downloadUrl = null;
430
+ }
431
+ };
432
+
433
+ const updateSaveButtonState = () => {
434
+ if (!saveBtn) {
435
+ return;
436
+ }
437
+ saveBtn.disabled = recordedSamples === 0 || !recordingComplete;
438
+ };
439
+
440
+ const clearRecordedChunks = () => {
441
+ recordedChunks = [];
442
+ recordedSamples = 0;
443
+ recordingComplete = false;
444
+ revokeDownloadUrl();
445
+ updateSaveButtonState();
446
+ };
447
+
448
+ const createWavBlob = () => {
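+ // Build a 44-byte RIFF/WAVE header (mono, 16-bit, SAMPLE_RATE) followed by the recorded PCM chunks.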
449
+ if (!recordedSamples) {
450
+ return null;
451
+ }
452
+ const wavBuffer = new ArrayBuffer(44 + recordedSamples * 2);
453
+ const view = new DataView(wavBuffer);
454
+ const writeString = (offset, str) => {
455
+ for (let i = 0; i < str.length; i += 1) {
456
+ view.setUint8(offset + i, str.charCodeAt(i));
457
+ }
458
+ };
459
+
460
+ writeString(0, 'RIFF');
461
+ view.setUint32(4, 36 + recordedSamples * 2, true);
462
+ writeString(8, 'WAVE');
463
+ writeString(12, 'fmt ');
464
+ view.setUint32(16, 16, true);
465
+ view.setUint16(20, 1, true);
466
+ view.setUint16(22, 1, true);
467
+ view.setUint32(24, SAMPLE_RATE, true);
468
+ view.setUint32(28, SAMPLE_RATE * 2, true);
469
+ view.setUint16(32, 2, true);
470
+ view.setUint16(34, 16, true);
471
+ writeString(36, 'data');
472
+ view.setUint32(40, recordedSamples * 2, true);
473
+
474
+ const pcmData = new Int16Array(wavBuffer, 44, recordedSamples);
475
+ let offset = 0;
476
+ recordedChunks.forEach(chunk => {
477
+ const chunkData = new Int16Array(chunk);
478
+ pcmData.set(chunkData, offset);
479
+ offset += chunkData.length;
480
+ });
481
+ return new Blob([wavBuffer], { type: 'audio/wav' });
482
+ };
483
+
484
+ const updateCfgDisplay = () => {
485
+ cfgValueLabel.textContent = Number(cfgSelect.value).toFixed(2);
486
+ };
487
+
488
+ const updateStepsDisplay = () => {
489
+ stepsValueLabel.textContent = Number(stepsSelect.value).toString();
490
+ };
491
+
492
+ cfgSelect.addEventListener('input', updateCfgDisplay);
493
+ stepsSelect.addEventListener('input', updateStepsDisplay);
494
+ updateCfgDisplay();
495
+ updateStepsDisplay();
496
+
497
+ const pad2 = value => value.toString().padStart(2, '0');
498
+ const pad3 = value => value.toString().padStart(3, '0');
499
+
500
+ const formatLocalTimestamp = () => {
501
+ const d = new Date();
502
+ const year = d.getFullYear();
503
+ const month = pad2(d.getMonth() + 1);
504
+ const day = pad2(d.getDate());
505
+ const hours = pad2(d.getHours());
506
+ const minutes = pad2(d.getMinutes());
507
+ const seconds = pad2(d.getSeconds());
508
+ const millis = pad3(d.getMilliseconds());
509
+ return `${year}-${month}-${day} ${hours}:${minutes}:${seconds}.${millis}`;
510
+ };
511
+
512
+ const formatSeconds = raw => {
513
+ const value = Number(raw);
514
+ return Number.isFinite(value) ? value.toFixed(2) : '0.00';
515
+ };
516
+
517
+ const parseTimestamp = value => {
518
+ if (!value) {
519
+ return new Date();
520
+ }
521
+ if (/\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}/.test(value)) {
522
+ return new Date(value.replace(' ', 'T'));
523
+ }
524
+ return new Date(value);
525
+ };
526
+
527
+ const setModelGenerated = value => {
528
+ const numeric = Number(value);
529
+ if (!Number.isFinite(numeric)) {
530
+ return;
531
+ }
532
+ modelGeneratedTotal = Math.max(0, numeric);
533
+ modelGeneratedLabel.textContent = formatSeconds(modelGeneratedTotal);
534
+ };
535
+
536
+ const setPlaybackElapsed = value => {
537
+ const capped = Math.min(modelGeneratedTotal, Math.max(0, value));
538
+ lastPlaybackElapsed = capped;
539
+ playbackElapsedLabel.textContent = formatSeconds(lastPlaybackElapsed);
540
+ };
541
+
542
+ const STREAMING_WPM = 180;
543
+ const STREAMING_INTERVAL_MS = 60000 / STREAMING_WPM;
544
+ let previewTimeoutId = null;
545
+ let previewTokens = [];
546
+ let previewIndex = 0;
547
+ let previewActive = false;
548
+
549
+ const clearPreviewTimer = () => {
550
+ if (previewTimeoutId) {
551
+ clearTimeout(previewTimeoutId);
552
+ previewTimeoutId = null;
553
+ }
554
+ };
555
+
556
+ const setPreviewIdle = message => {
557
+ if (!streamingPreview) {
558
+ return;
559
+ }
560
+ streamingPreview.classList.remove('streaming-active');
561
+ streamingPreview.textContent = message;
562
+ };
563
+
564
+ const schedulePreviewTick = () => {
565
+ if (!streamingPreview) {
566
+ return;
567
+ }
568
+ if (previewIndex >= previewTokens.length) {
569
+ streamingPreview.classList.remove('streaming-active');
570
+ return;
571
+ }
572
+
573
+ streamingPreview.classList.add('streaming-active');
574
+
575
+ streamingPreview.textContent += previewTokens[previewIndex];
576
+ previewIndex += 1;
577
+ previewTimeoutId = setTimeout(schedulePreviewTick, STREAMING_INTERVAL_MS);
578
+ };
579
+
580
+ const updateStreamingPreview = () => {
581
+ if (!streamingPreview) {
582
+ return;
583
+ }
584
+ clearPreviewTimer();
585
+ previewIndex = 0;
586
+ const source = (promptInput?.value || '').trimEnd();
587
+ streamingPreview.textContent = '';
588
+ previewTokens = source.match(/\S+\s*/g) || [];
589
+ schedulePreviewTick();
590
+ };
591
+
592
+ const clearLogs = () => {
593
+ if (logOutput) {
594
+ logOutput.textContent = '';
595
+ }
596
+ logEntries.length = 0;
597
+ modelGeneratedTotal = 0;
598
+ setModelGenerated(0);
599
+ };
600
+
601
+ const appendLog = (message, timestamp) => {
602
+ if (!logOutput) {
603
+ return;
604
+ }
605
+ const finalTimestamp = timestamp || formatLocalTimestamp();
606
+ const entry = {
607
+ timestamp: finalTimestamp,
608
+ date: parseTimestamp(finalTimestamp),
609
+ message,
610
+ seq: logSequence += 1,
611
+ };
612
+ logEntries.push(entry);
613
+ logEntries.sort((a, b) => {
614
+ const diff = a.date.getTime() - b.date.getTime();
615
+ return diff !== 0 ? diff : a.seq - b.seq;
616
+ });
617
+ if (logEntries.length > 400) {
618
+ logEntries.splice(0, logEntries.length - 400);
619
+ }
620
+ logOutput.textContent = logEntries
621
+ .map(item => `[${item.timestamp}] ${item.message}`)
622
+ .join('\n');
623
+ logOutput.scrollTop = logOutput.scrollHeight;
624
+ };
625
+
626
+ const handleSaveClick = () => {
627
+ if (!recordedSamples) {
628
+ appendLog('[Frontend] Save requested but no audio received yet');
629
+ return;
630
+ }
631
+ const wavBlob = createWavBlob();
632
+ if (!wavBlob) {
633
+ appendLog('[Error] Failed to assemble WAV data for download');
634
+ return;
635
+ }
636
+ revokeDownloadUrl();
637
+ downloadUrl = URL.createObjectURL(wavBlob);
638
+ const link = document.createElement('a');
639
+ const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
640
+ link.href = downloadUrl;
641
+ link.download = `vibevoice_realtime_audio_${timestamp}.wav`;
642
+ document.body.appendChild(link);
643
+ link.click();
644
+ document.body.removeChild(link);
645
+ appendLog('[Frontend] Audio download triggered');
646
+ };
647
+
648
+ const stopPlaybackTimer = () => {
649
+ if (playbackTimer) {
650
+ clearInterval(playbackTimer);
651
+ playbackTimer = null;
652
+ }
653
+ };
654
+
655
+ const startPlaybackTimer = () => {
656
+ stopPlaybackTimer();
657
+ playbackTimer = setInterval(() => {
658
+ setPlaybackElapsed(playbackSamples / SAMPLE_RATE);
659
+ }, 250);
660
+ };
661
+
662
+ const loadVoices = async () => {
663
+ try {
664
+ voiceSelect.disabled = true;
665
+ const response = await fetch('/config');
666
+ if (!response.ok) {
667
+ throw new Error(`Failed to fetch config: ${response.status}`);
668
+ }
669
+ const data = await response.json();
670
+ const voices = Array.isArray(data.voices) ? data.voices : [];
671
+ voiceSelect.innerHTML = '';
672
+ if (voices.length === 0) {
673
+ const option = document.createElement('option');
674
+ option.value = '';
675
+ option.textContent = 'No voices available';
676
+ voiceSelect.appendChild(option);
677
+ voiceSelect.disabled = true;
678
+ appendLog('[Error] No voice presets available');
679
+ return;
680
+ }
681
+
682
+ voices.forEach(voice => {
683
+ const option = document.createElement('option');
684
+ option.value = voice;
685
+ option.textContent = voice;
686
+ voiceSelect.appendChild(option);
687
+ });
688
+
689
+ if (data.default_voice && voices.includes(data.default_voice)) {
690
+ voiceSelect.value = data.default_voice;
691
+ }
692
+ voiceSelect.disabled = false;
693
+ appendLog(`[Frontend] Loaded ${voices.length} voice presets`);
694
+ } catch (err) {
695
+ console.error('Failed to load voices', err);
696
+ voiceSelect.innerHTML = '';
697
+ const option = document.createElement('option');
698
+ option.value = '';
699
+ option.textContent = 'Load failed';
700
+ voiceSelect.appendChild(option);
701
+ voiceSelect.disabled = true;
702
+ appendLog('[Error] Failed to load voice presets');
703
+ }
704
+ };
705
+
706
+ loadVoices();
707
+
708
+ resetBtn.addEventListener('click', () => {
709
+ cfgSelect.value = '1.5';
710
+ stepsSelect.value = '5';
711
+ updateCfgDisplay();
712
+ updateStepsDisplay();
713
+ appendLog('[Frontend] Controls reset to defaults (CFG=1.5, Steps=5)');
714
+ });
715
+
716
+ if (promptInput) {
717
+ promptInput.addEventListener('input', () => {
718
+ if (previewActive) {
719
+ updateStreamingPreview();
720
+ }
721
+ });
722
+ }
723
+
724
+ const handleLogMessage = raw => {
725
+ let payload;
726
+ try {
727
+ payload = JSON.parse(raw);
728
+ } catch (err) {
729
+ appendLog(`[Error] Failed to parse log message: ${raw}`);
730
+ return;
731
+ }
732
+ if (!payload || payload.type !== 'log') {
733
+ appendLog(`[Log] ${raw}`);
734
+ return;
735
+ }
736
+
737
+ const { event, data = {}, timestamp } = payload;
738
+ switch (event) {
739
+ case 'backend_request_received': {
740
+ const cfg = typeof data.cfg_scale === 'number' ? data.cfg_scale.toFixed(3) : data.cfg_scale;
741
+ const steps = data.inference_steps ?? 'default';
742
+ const voice = data.voice || 'default';
743
+ const textLength = data.text_length ?? 0;
744
+ appendLog(`[Backend] Received request (text length=${textLength}, CFG=${cfg}, steps=${steps}, voice=${voice})`, timestamp);
745
+ break;
746
+ }
747
+ case 'backend_first_chunk_sent':
748
+ appendLog('[Backend] Sent first audio chunk', timestamp);
749
+ break;
750
+ case 'model_progress':
751
+ if (typeof data.generated_sec !== 'undefined') {
752
+ const generated = Number(data.generated_sec);
753
+ if (Number.isFinite(generated)) {
754
+ setModelGenerated(generated);
755
+ }
756
+ }
757
+ return;
758
+ case 'generation_error':
759
+ appendLog(`[Error] Generation error: ${data.message || 'Unknown error'}`, timestamp);
760
+ break;
761
+ case 'backend_error':
762
+ appendLog(`[Error] Backend error: ${data.message || 'Unknown error'}`, timestamp);
763
+ break;
764
+ case 'client_disconnected':
765
+ appendLog('[Frontend] Client disconnected', timestamp);
766
+ break;
767
+ case 'backend_stream_complete':
768
+ appendLog('[Backend] Backend finished', timestamp);
769
+ recordingComplete = true;
770
+ updateSaveButtonState();
771
+ break;
772
+ default:
773
+ appendLog(`[Log] Event ${event}`, timestamp);
774
+ break;
775
+ }
776
+ };
777
+
778
+ const updateButtonLabel = () => {
779
+ controlBtn.textContent = isPlaying ? 'Stop' : 'Start';
780
+ controlBtn.classList.toggle('playing', isPlaying);
781
+ };
782
+
783
+ const appendAudio = chunk => {
784
+ const merged = new Float32Array(buffer.length + chunk.length);
785
+ merged.set(buffer, 0);
786
+ merged.set(chunk, buffer.length);
787
+ buffer = merged;
788
+ };
789
+
790
+ const pullAudio = frameCount => {
791
+ const available = buffer.length;
792
+ if (available === 0) {
793
+ return new Float32Array(frameCount);
794
+ }
795
+ if (available <= frameCount) {
796
+ const chunk = buffer;
797
+ buffer = new Float32Array(0);
798
+ if (chunk.length < frameCount) {
799
+ const padded = new Float32Array(frameCount);
800
+ padded.set(chunk, 0);
801
+ return padded;
802
+ }
803
+ return chunk;
804
+ }
805
+ const chunk = buffer.subarray(0, frameCount);
806
+ buffer = buffer.subarray(frameCount);
807
+ return chunk;
808
+ };
809
+
810
+ const closeSocket = () => {
811
+ if (socket && (socket.readyState === WebSocket.OPEN || socket.readyState === WebSocket.CONNECTING)) {
812
+ socket.close();
813
+ }
814
+ socket = null;
815
+ };
816
+
817
+ const resetPlaybackFlags = (resetSamples = true) => {
818
+ buffer = new Float32Array(0);
819
+ if (resetSamples) {
820
+ playbackSamples = 0;
821
+ setPlaybackElapsed(0);
822
+ }
823
+ hasStartedPlayback = false;
824
+ silentFrameCount = 0;
825
+ firstBrowserChunkLogged = false;
826
+ playbackStartedLogged = false;
827
+ };
828
+
829
+ const teardownAudio = () => {
830
+ if (scriptNode) {
831
+ try { scriptNode.disconnect(); } catch (err) { console.warn('disconnect error', err); }
832
+ scriptNode.onaudioprocess = null;
833
+ }
834
+ if (audioCtx) {
835
+ try { audioCtx.close(); } catch (err) { console.warn('audioCtx.close error', err); }
836
+ }
837
+ audioCtx = null;
838
+ scriptNode = null;
839
+ };
840
+
841
+ const resetState = (resetSamples = true) => {
842
+ closeSocket();
843
+ teardownAudio();
844
+ resetPlaybackFlags(resetSamples);
845
+ isPlaying = false;
846
+ stopPlaybackTimer();
847
+ };
848
+
849
+ const createAudioChain = () => {
850
+ teardownAudio();
851
+ resetPlaybackFlags();
852
+ audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE });
853
+ scriptNode = audioCtx.createScriptProcessor(BUFFER_SIZE, 0, 1);
854
+
855
+ const minBufferSamples = Math.floor(audioCtx.sampleRate * PREBUFFER_SEC);
856
+
857
+ scriptNode.onaudioprocess = event => {
858
+ const output = event.outputBuffer.getChannelData(0);
859
+ const needPrebuffer = !hasStartedPlayback;
860
+ const socketClosed = !socket || socket.readyState === WebSocket.CLOSED || socket.readyState === WebSocket.CLOSING;
861
+
862
+ if (needPrebuffer) {
863
+ if (buffer.length >= minBufferSamples || socketClosed) {
864
+ hasStartedPlayback = true;
865
+ if (!playbackStartedLogged) {
866
+ playbackStartedLogged = true;
867
+ appendLog('[Frontend] Browser started to play audio');
868
+ startPlaybackTimer();
869
+ }
870
+ } else {
871
+ output.fill(0);
872
+ return;
873
+ }
874
+ }
875
+
876
+ const chunk = pullAudio(output.length);
877
+ output.set(chunk);
878
+
879
+ if (hasStartedPlayback) {
880
+ playbackSamples += output.length;
881
+ }
882
+
883
+ if (socketClosed && buffer.length === 0 && chunk.every(sample => sample === 0)) {
884
+ silentFrameCount += 1;
885
+ if (silentFrameCount >= 4) {
886
+ stop();
887
+ }
888
+ } else {
889
+ silentFrameCount = 0;
890
+ }
891
+ };
892
+
893
+ scriptNode.connect(audioCtx.destination);
894
+ };
895
+
896
+ const start = () => {
897
+ if (isPlaying) {
898
+ return;
899
+ }
900
+
901
+ const textValue = promptInput?.value || '';
902
+ const cfgValue = Number(cfgSelect.value);
903
+ const stepsValue = Number(stepsSelect.value);
904
+ const voiceValue = voiceSelect.value || '';
905
+
906
+ clearLogs();
907
+ const cfgDisplay = Number.isFinite(cfgValue) ? cfgValue.toFixed(3) : 'default';
908
+ const stepsDisplay = Number.isFinite(stepsValue) ? stepsValue : 'default';
909
+ appendLog(`[Frontend] Start button clicked, CFG=${cfgDisplay}, Steps=${stepsDisplay}, Speaker=${voiceValue || 'default'}`);
910
+ setModelGenerated(0);
911
+ setPlaybackElapsed(0);
912
+
913
+ resetState(true);
914
+ clearRecordedChunks();
915
+ isPlaying = true;
916
+ previewActive = true;
917
+ updateStreamingPreview();
918
+ updateButtonLabel();
919
+ createAudioChain();
920
+
921
+ const params = new URLSearchParams();
922
+ params.set('text', textValue);
923
+ if (!Number.isNaN(cfgValue)) {
924
+ params.set('cfg', cfgValue.toFixed(3));
925
+ }
926
+ if (!Number.isNaN(stepsValue)) {
927
+ params.set('steps', stepsValue.toString());
928
+ }
929
+ if (voiceValue) {
930
+ params.set('voice', voiceValue);
931
+ }
932
+ const wsUrl = `${location.origin.replace(/^http/, 'ws')}/stream?${params.toString()}`;
933
+
934
+ socket = new WebSocket(wsUrl);
935
+ socket.binaryType = 'arraybuffer';
936
+
937
+ socket.onmessage = event => {
938
+ if (typeof event.data === 'string') {
939
+ handleLogMessage(event.data);
940
+ return;
941
+ }
942
+
943
+ if (!(event.data instanceof ArrayBuffer)) {
944
+ return;
945
+ }
946
+ const rawBuffer = event.data.slice(0);
947
+ const view = new DataView(rawBuffer);
948
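+ // Decode little-endian PCM16 samples into Float32 values in [-1, 1] for the playback buffer.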
+ const floatChunk = new Float32Array(view.byteLength / 2);
949
+ for (let i = 0; i < floatChunk.length; i += 1) {
950
+ floatChunk[i] = view.getInt16(i * 2, true) / 32768;
951
+ }
952
+ appendAudio(floatChunk);
953
+ recordedChunks.push(rawBuffer);
954
+ recordedSamples += floatChunk.length;
955
+ updateSaveButtonState();
956
+
957
+ if (!firstBrowserChunkLogged) {
958
+ firstBrowserChunkLogged = true;
959
+ appendLog('[Frontend] Received first audio chunk');
960
+ }
961
+ };
962
+
963
+ socket.onerror = err => {
964
+ console.error('WebSocket error', err);
965
+ appendLog(`[Error] WebSocket error: ${err?.message || 'connection failed'}`);
966
+ stop();
967
+ };
968
+
969
+ socket.onclose = () => {
970
+ socket = null;
971
+ if (recordedSamples > 0) {
972
+ recordingComplete = true;
973
+ updateSaveButtonState();
974
+ }
975
+ };
976
+ };
977
+
978
+ const stop = () => {
979
+ if (!isPlaying) {
980
+ resetState(false);
981
+ updateButtonLabel();
982
+ return;
983
+ }
984
+ resetState(false);
985
+ setPlaybackElapsed(Math.min(lastPlaybackElapsed, modelGeneratedTotal));
986
+ appendLog('[Frontend] Playback stopped');
987
+ if (recordedSamples > 0) {
988
+ recordingComplete = true;
989
+ updateSaveButtonState();
990
+ }
991
+ previewActive = false;
992
+ clearPreviewTimer();
993
+ streamingPreview?.classList.remove('streaming-active');
994
+ updateButtonLabel();
995
+ };
996
+
997
+ controlBtn.addEventListener('click', () => {
998
+ if (isPlaying) {
999
+ stop();
1000
+ } else {
1001
+ start();
1002
+ }
1003
+ });
1004
+ if (saveBtn) {
1005
+ saveBtn.addEventListener('click', handleSaveClick);
1006
+ }
1007
+ updateButtonLabel();
1008
+ updateSaveButtonState();
1009
+ window.addEventListener('beforeunload', () => {
1010
+ resetState();
1011
+ clearPreviewTimer();
1012
+ revokeDownloadUrl();
1013
+ });
1014
+ })();
1015
+ </script>
1016
+ </body>
1017
+ </html>
docs/vibevoice-realtime-0.5b.md ADDED
@@ -0,0 +1,139 @@
1
+ <div align="center">
2
+
3
+ ## 🎙️ VibeVoice-Realtime: Real-time Long‑Form Text‑to‑Speech with Streaming Input
4
+ [![Hugging Face](https://img.shields.io/badge/HuggingFace-Collection-orange?logo=huggingface)](https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B)
5
+ [![Colab](https://img.shields.io/badge/Run-Colab-orange?logo=googlecolab)](https://colab.research.google.com/github/microsoft/VibeVoice/blob/main/demo/vibevoice_realtime_colab.ipynb)
6
+ </div>
7
+
8
+ VibeVoice-Realtime is a **lightweight real‑time** text-to-speech model supporting **streaming text input** and **robust long-form speech generation**. It can be used to build real-time TTS services, narrate live data streams, and let an LLM of your choice start speaking from its very first tokens, long before a full answer is generated. It produces initial audible speech in **~300 milliseconds** (hardware dependent).
9
+
10
+ <div align="center">
11
+
12
+ | Model | Context Length | Generation Length | Weight |
13
+ |-------|----------------|----------|----------|
14
+ | VibeVoice-Realtime-0.5B | 8K | ~10 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B) |
15
+
16
+ </div>
17
+
18
+ > Note (multilingual exploration): Although the model is primarily built for English, we found that it still exhibits a certain level of multilingual capability—and even performs reasonably well in some languages. We provide nine additional languages (German, French, Italian, Japanese, Korean, Dutch, Polish, Portuguese, and Spanish) for users to explore. These multilingual behaviors have not been extensively tested; use with caution and share observations.
19
+
20
+ The model uses an interleaved, windowed design: it incrementally encodes incoming text chunks while, in parallel, continuing diffusion-based acoustic latent generation from prior context. Unlike the full multi-speaker long-form variants, this streaming model removes the semantic tokenizer and relies solely on an efficient acoustic tokenizer operating at an ultra-low frame rate (7.5 Hz).
21
+
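+ To make the interleaving concrete, here is a minimal illustrative sketch of such a loop. It is not the actual VibeVoice API: `encode_text_chunk`, `generate_latents`, and `decode_audio` are hypothetical stand-ins for the text encoder, the windowed diffusion step, and the acoustic decoder.
+
+ ```python
+ # Illustrative sketch only; the three helpers are placeholders, not real VibeVoice interfaces.
+ def encode_text_chunk(chunk):              # stand-in: incremental text encoding
+     return f"enc({chunk})"
+
+ def generate_latents(text_window, prior):  # stand-in: diffusion continues from prior latents
+     return [f"lat{len(prior)}"]
+
+ def decode_audio(latents):                 # stand-in: 7.5 Hz latents -> waveform
+     return f"audio({latents})"
+
+ def stream_tts(text_chunks, window_size=4):
+     text_states, latents = [], []
+     for chunk in text_chunks:
+         text_states.append(encode_text_chunk(chunk))                 # encode text as it arrives
+         new = generate_latents(text_states[-window_size:], latents)  # generate from prior context
+         latents.extend(new)
+         yield decode_audio(new)                                      # stream audio chunk by chunk
+ ```
+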
22
+ <div align="center">
23
+ <picture>
24
+ <source media="(prefers-color-scheme: dark)" srcset="../Figures/VibeVoice_logo_white.png">
25
+ <img src="../Figures/VibeVoice_Realtime.png" alt="VibeVoice Realtime Overview" width="800" />
26
+ </picture>
27
+ <br>
28
+ <em>Overview of VibeVoice Realtime Model.</em>
29
+ </div>
30
+
31
+ Key features:
32
+ - Parameter size: 0.5B (deployment-friendly)
33
+ - Real-time TTS (~300 milliseconds first audible latency)
34
+ - Streaming text input
35
+ - Robust long-form speech generation
36
+
37
+ This real-time variant supports only a single speaker. For multi‑speaker conversational speech generation, please use the other VibeVoice models (long‑form multi‑speaker variants). The model is primarily intended for English speech; languages beyond the exploratory presets noted above may produce unpredictable results.
38
+
39
+ To mitigate deepfake risks and ensure low latency for the first speech chunk, voice prompts are provided in an embedded format. For users requiring voice customization, please reach out to our team. We will also be expanding the range of available speakers.
40
+
41
+
42
+ ### 📋 TODO
43
+
44
+ - [ ] Add more voices (expand available speakers/voice timbres)
45
+ - [ ] Implement a streaming text-input interface to feed new tokens while audio is still being generated
46
+ - [ ] Merge models into the official Hugging Face `transformers` repository
47
+
48
+
49
+ ### 🎵 Demo Examples
50
+
51
+ <div align="center" id="generated-example-audio-vibevoice-realtime">
52
+
53
+ https://github.com/user-attachments/assets/9aa8ab3c-681d-4a02-b9ea-3f54ffd180b2
54
+
55
+ </div>
56
+
57
+
58
+ ## Results
59
+
60
+ The model achieves satisfactory performance on short-sentence benchmarks, although its primary focus is long‑form speech generation.
61
+
62
+ ### Zero-shot TTS performance on LibriSpeech test-clean set
63
+
64
+ | Model | WER (%) ↓ | Speaker Similarity ↑ |
65
+ |:--------------------|:---------:|:----------------:|
66
+ | VALL-E 2 | 2.40 | 0.643 |
67
+ | Voicebox | 1.90 | 0.662 |
68
+ | MELLE | 2.10 | 0.625 |
69
+ | **VibeVoice-Realtime-0.5B** | 2.00 | 0.695 |
70
+
71
+ ### Zero-shot TTS performance on SEED test-en set
72
+
73
+ | Model | WER (%) ↓ | Speaker Similarity ↑ |
74
+ |:--------------------|:---------:|:----------------:|
75
+ | MaskGCT | 2.62 | 0.714 |
76
+ | Seed-TTS | 2.25 | 0.762 |
77
+ | FireRedTTS | 3.82 | 0.460 |
78
+ | SparkTTS | 1.98 | 0.584 |
79
+ | CosyVoice2 | 2.57 | 0.652 |
80
+ | **VibeVoice-Realtime-0.5B** | 2.05 | 0.633 |
81
+
82
+
83
+ ## Installation
84
+ We recommend using an NVIDIA Deep Learning Container to manage the CUDA environment.
85
+
86
+ 1. Launch Docker
87
+ ```bash
88
+ # NVIDIA PyTorch Container 24.07 / 24.10 / 24.12 verified.
89
+ # Later versions are also compatible.
90
+ sudo docker run --privileged --net=host --ipc=host --ulimit memlock=-1:-1 --ulimit stack=-1:-1 --gpus all --rm -it nvcr.io/nvidia/pytorch:24.07-py3
91
+
92
+ ## If flash attention is not included in your docker environment, you need to install it manually
93
+ ## Refer to https://github.com/Dao-AILab/flash-attention for installation instructions
94
+ # pip install flash-attn --no-build-isolation
95
+ ```
96
+
97
+ 2. Install from GitHub
98
+ ```bash
99
+ git clone https://github.com/microsoft/VibeVoice.git
100
+ cd VibeVoice/
101
+
102
+ pip install -e .
103
+ ```
104
+
105
+ ## Usage
106
+
107
+
108
+ ### Usage 1: Launch the real-time WebSocket demo
109
+ Note: an NVIDIA T4 and a Mac M4 Pro achieve real-time synthesis in our tests; devices with weaker inference capability may require further testing and speed optimizations.
110
+
111
+ Due to network latency, the time until audio playback is heard may exceed the ~300 ms first-chunk generation latency.
112
+ ```bash
113
+ python demo/vibevoice_realtime_demo.py --model_path microsoft/VibeVoice-Realtime-0.5B
114
+ ```
115
+
116
+ Tip: Just try it on [Colab](https://colab.research.google.com/github/microsoft/VibeVoice/blob/main/demo/vibevoice_realtime_colab.ipynb).
117
+
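+ The `/stream` WebSocket endpoint started by this demo can also be consumed programmatically. Below is a minimal client sketch: it is illustrative only, assumes the third-party `websockets` package (`pip install websockets`), and assumes the server listens on `ws://localhost:8000` (adjust to your deployment). It passes the same `text`/`cfg`/`steps`/`voice` query parameters as the web UI; binary frames carry 24 kHz 16-bit PCM audio, text frames carry JSON log events, and available voice keys can be listed via `GET /config`.
+
+ ```python
+ # Minimal /stream client sketch; the host/port and the websockets dependency are assumptions.
+ import asyncio
+ import json
+ import wave
+ from urllib.parse import urlencode
+
+ import websockets  # third-party: pip install websockets
+
+ async def synthesize(text: str, out_path: str = "out.wav") -> None:
+     params = urlencode({"text": text, "cfg": 1.5, "steps": 5})  # add "voice" to pick a preset
+     async with websockets.connect(f"ws://localhost:8000/stream?{params}") as ws:
+         with wave.open(out_path, "wb") as wav:
+             wav.setnchannels(1)      # mono
+             wav.setsampwidth(2)      # 16-bit PCM
+             wav.setframerate(24000)  # the demo streams 24 kHz audio
+             async for message in ws:
+                 if isinstance(message, bytes):   # binary frame: PCM16 audio chunk
+                     wav.writeframes(message)
+                 else:                            # text frame: JSON log event
+                     print(json.loads(message).get("event"))
+
+ asyncio.run(synthesize("Hello from VibeVoice!"))
+ ```
+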
118
+ ### Usage 2: Inference directly from files
119
+ ```bash
120
+ # Example input texts for this demo are provided under demo/text_examples/
121
+ python demo/realtime_model_inference_from_file.py --model_path microsoft/VibeVoice-Realtime-0.5B --txt_path demo/text_examples/1p_vibevoice.txt --speaker_name Carter
122
+ ```
123
+
124
+
125
+ ## Risks and limitations
126
+
127
+ While efforts have been made to optimize the model through various techniques, it may still produce outputs that are unexpected, biased, or inaccurate. VibeVoice inherits any biases, errors, or omissions of its base model (specifically, Qwen2.5-0.5B in this release).
128
+
129
+ Potential for Deepfakes and Disinformation: High-quality synthetic speech can be misused to create convincing fake audio content for impersonation, fraud, or spreading disinformation. Users must ensure transcripts are reliable, check content accuracy, and avoid using generated content in misleading ways. Users are expected to use the generated content and to deploy the models in a lawful manner, in full compliance with all applicable laws and regulations in the relevant jurisdictions. It is best practice to disclose the use of AI when sharing AI-generated content.
130
+
131
+ English only: Transcripts in languages other than English may result in unexpected audio outputs.
132
+
133
+ Non-Speech Audio: The model focuses solely on speech synthesis and does not handle background noise, music, or other sound effects.
134
+
135
+ Code, formulas, and special symbols: The model does not currently support reading code, mathematical formulas, or uncommon symbols. Please pre‑process input text to remove or normalize such content to avoid unpredictable results.
136
+
137
+ Very short inputs: When the input text is extremely short (three words or fewer), the model’s stability may degrade.
138
+
139
+ We do not recommend using VibeVoice in commercial or real-world applications without further testing and development. This model is intended for research and development purposes only. Please use responsibly.
vibevoice/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # vibevoice/__init__.py
2
+ from vibevoice.modular import (
3
+ VibeVoiceStreamingForConditionalGenerationInference,
4
+ VibeVoiceStreamingConfig,
5
+ )
6
+ from vibevoice.processor import (
7
+ VibeVoiceStreamingProcessor,
8
+ VibeVoiceTokenizerProcessor,
9
+ )
10
+
11
+ __all__ = [
12
+ "VibeVoiceStreamingForConditionalGenerationInference",
13
+ "VibeVoiceStreamingConfig",
14
+ "VibeVoiceStreamingProcessor",
15
+ "VibeVoiceTokenizerProcessor",
16
+ ]
vibevoice/configs/qwen2.5_1.5b_64k.json ADDED
@@ -0,0 +1,112 @@
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "acoustic_vae_dim": 64,
4
+ "acoustic_tokenizer_config": {
5
+ "causal": true,
6
+ "channels": 1,
7
+ "conv_bias": true,
8
+ "conv_norm": "none",
9
+ "corpus_normalize": 0.0,
10
+ "decoder_depths": null,
11
+ "decoder_n_filters": 32,
12
+ "decoder_ratios": [
13
+ 8,
14
+ 5,
15
+ 5,
16
+ 4,
17
+ 2,
18
+ 2
19
+ ],
20
+ "disable_last_norm": true,
21
+ "encoder_depths": "3-3-3-3-3-3-8",
22
+ "encoder_n_filters": 32,
23
+ "encoder_ratios": [
24
+ 8,
25
+ 5,
26
+ 5,
27
+ 4,
28
+ 2,
29
+ 2
30
+ ],
31
+ "fix_std": 0.5,
32
+ "layer_scale_init_value": 1e-06,
33
+ "layernorm": "RMSNorm",
34
+ "layernorm_elementwise_affine": true,
35
+ "layernorm_eps": 1e-05,
36
+ "mixer_layer": "depthwise_conv",
37
+ "model_type": "vibepod_acoustic_tokenizer",
38
+ "pad_mode": "constant",
39
+ "std_dist_type": "gaussian",
40
+ "vae_dim": 64,
41
+ "weight_init_value": 0.01
42
+ },
43
+ "decoder_config": {
44
+ "attention_dropout": 0.0,
45
+ "hidden_act": "silu",
46
+ "hidden_size": 1536,
47
+ "initializer_range": 0.02,
48
+ "intermediate_size": 8960,
49
+ "max_position_embeddings": 65536,
50
+ "max_window_layers": 28,
51
+ "model_type": "qwen2",
52
+ "num_attention_heads": 12,
53
+ "num_hidden_layers": 28,
54
+ "num_key_value_heads": 2,
55
+ "rms_norm_eps": 1e-06,
56
+ "rope_scaling": null,
57
+ "rope_theta": 1000000.0,
58
+ "sliding_window": null,
59
+ "tie_word_embeddings": true,
60
+ "torch_dtype": "bfloat16",
61
+ "use_cache": true,
62
+ "use_sliding_window": false,
63
+ "vocab_size": 151936
64
+ },
65
+ "diffusion_head_config": {
66
+ "ddpm_batch_mul": 4,
67
+ "ddpm_beta_schedule": "cosine",
68
+ "ddpm_num_inference_steps": 20,
69
+ "ddpm_num_steps": 1000,
70
+ "diffusion_type": "ddpm",
71
+ "head_ffn_ratio": 3.0,
72
+ "head_layers": 4,
73
+ "hidden_size": 1536,
74
+ "latent_size": 64,
75
+ "model_type": "vibepod_diffusion_head",
76
+ "prediction_type": "v_prediction",
77
+ "rms_norm_eps": 1e-05,
78
+ "speech_vae_dim": 64
79
+ },
80
+ "model_type": "vibepod",
81
+ "semantic_tokenizer_config": {
82
+ "causal": true,
83
+ "channels": 1,
84
+ "conv_bias": true,
85
+ "conv_norm": "none",
86
+ "corpus_normalize": 0.0,
87
+ "disable_last_norm": true,
88
+ "encoder_depths": "3-3-3-3-3-3-8",
89
+ "encoder_n_filters": 32,
90
+ "encoder_ratios": [
91
+ 8,
92
+ 5,
93
+ 5,
94
+ 4,
95
+ 2,
96
+ 2
97
+ ],
98
+ "fix_std": 0,
99
+ "layer_scale_init_value": 1e-06,
100
+ "layernorm": "RMSNorm",
101
+ "layernorm_elementwise_affine": true,
102
+ "layernorm_eps": 1e-05,
103
+ "mixer_layer": "depthwise_conv",
104
+ "model_type": "vibepod_semantic_tokenizer",
105
+ "pad_mode": "constant",
106
+ "std_dist_type": "none",
107
+ "vae_dim": 128,
108
+ "weight_init_value": 0.01
109
+ },
110
+ "semantic_vae_dim": 128,
111
+ "torch_dtype": "bfloat16"
112
+ }
vibevoice/configs/qwen2.5_7b_32k.json ADDED
@@ -0,0 +1,113 @@
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "acoustic_vae_dim": 64,
4
+ "acoustic_tokenizer_config": {
5
+ "causal": true,
6
+ "channels": 1,
7
+ "conv_bias": true,
8
+ "conv_norm": "none",
9
+ "corpus_normalize": 0.0,
10
+ "decoder_depths": null,
11
+ "decoder_n_filters": 32,
12
+ "decoder_ratios": [
13
+ 8,
14
+ 5,
15
+ 5,
16
+ 4,
17
+ 2,
18
+ 2
19
+ ],
20
+ "disable_last_norm": true,
21
+ "encoder_depths": "3-3-3-3-3-3-8",
22
+ "encoder_n_filters": 32,
23
+ "encoder_ratios": [
24
+ 8,
25
+ 5,
26
+ 5,
27
+ 4,
28
+ 2,
29
+ 2
30
+ ],
31
+ "fix_std": 0.5,
32
+ "layer_scale_init_value": 1e-06,
33
+ "layernorm": "RMSNorm",
34
+ "layernorm_elementwise_affine": true,
35
+ "layernorm_eps": 1e-05,
36
+ "mixer_layer": "depthwise_conv",
37
+ "model_type": "vibepod_acoustic_tokenizer",
38
+ "pad_mode": "constant",
39
+ "std_dist_type": "gaussian",
40
+ "vae_dim": 64,
41
+ "weight_init_value": 0.01
42
+ },
43
+ "decoder_config": {
44
+ "attention_dropout": 0.0,
45
+ "hidden_act": "silu",
46
+ "hidden_size": 3584,
47
+ "initializer_range": 0.02,
48
+ "intermediate_size": 18944,
49
+ "max_position_embeddings": 32768,
50
+ "max_window_layers": 28,
51
+ "model_type": "qwen2",
52
+ "num_attention_heads": 28,
53
+ "num_hidden_layers": 28,
54
+ "num_key_value_heads": 4,
55
+ "rms_norm_eps": 1e-06,
56
+ "rope_theta": 1000000.0,
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": false,
59
+ "torch_dtype": "bfloat16",
60
+ "transformers_version": "4.40.1",
61
+ "use_cache": true,
62
+ "use_mrope": false,
63
+ "use_sliding_window": false,
64
+ "vocab_size": 152064
65
+ },
66
+ "diffusion_head_config": {
67
+ "ddpm_batch_mul": 4,
68
+ "ddpm_beta_schedule": "cosine",
69
+ "ddpm_num_inference_steps": 20,
70
+ "ddpm_num_steps": 1000,
71
+ "diffusion_type": "ddpm",
72
+ "head_ffn_ratio": 3.0,
73
+ "head_layers": 4,
74
+ "hidden_size": 3584,
75
+ "latent_size": 64,
76
+ "model_type": "vibepod_diffusion_head",
77
+ "prediction_type": "v_prediction",
78
+ "rms_norm_eps": 1e-05,
79
+ "speech_vae_dim": 64
80
+ },
81
+ "model_type": "vibepod",
82
+ "semantic_tokenizer_config": {
83
+ "causal": true,
84
+ "channels": 1,
85
+ "conv_bias": true,
86
+ "conv_norm": "none",
87
+ "corpus_normalize": 0.0,
88
+ "disable_last_norm": true,
89
+ "encoder_depths": "3-3-3-3-3-3-8",
90
+ "encoder_n_filters": 32,
91
+ "encoder_ratios": [
92
+ 8,
93
+ 5,
94
+ 5,
95
+ 4,
96
+ 2,
97
+ 2
98
+ ],
99
+ "fix_std": 0,
100
+ "layer_scale_init_value": 1e-06,
101
+ "layernorm": "RMSNorm",
102
+ "layernorm_elementwise_affine": true,
103
+ "layernorm_eps": 1e-05,
104
+ "mixer_layer": "depthwise_conv",
105
+ "model_type": "vibepod_semantic_tokenizer",
106
+ "pad_mode": "constant",
107
+ "std_dist_type": "none",
108
+ "vae_dim": 128,
109
+ "weight_init_value": 0.01
110
+ },
111
+ "semantic_vae_dim": 128,
112
+ "torch_dtype": "bfloat16"
113
+ }
vibevoice/modular/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # vibevoice/modular/__init__.py
2
+ from .modeling_vibevoice_streaming_inference import VibeVoiceStreamingForConditionalGenerationInference
3
+ from .configuration_vibevoice_streaming import VibeVoiceStreamingConfig
4
+ from .modeling_vibevoice_streaming import VibeVoiceStreamingModel, VibeVoiceStreamingPreTrainedModel
5
+ from .streamer import AudioStreamer, AsyncAudioStreamer
6
+
7
+ __all__ = [
8
+ "VibeVoiceStreamingForConditionalGenerationInference",
9
+ "VibeVoiceStreamingConfig",
10
+ "VibeVoiceStreamingModel",
11
+ "VibeVoiceStreamingPreTrainedModel",
12
+ "AudioStreamer",
13
+ "AsyncAudioStreamer",
14
+ ]
vibevoice/modular/configuration_vibevoice.py ADDED
@@ -0,0 +1,248 @@
1
+ """ VibeVoice_AcousticTokenizer model configuration"""
2
+
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ from transformers.configuration_utils import PretrainedConfig
6
+ from transformers.utils import logging
7
+
8
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
+ class VibeVoiceAcousticTokenizerConfig(PretrainedConfig):
14
+ model_type = "vibevoice_acoustic_tokenizer"
15
+
16
+ def __init__(
17
+ self,
18
+ channels: int = 1,
19
+ corpus_normalize: float = 0.0,
20
+ causal: bool = True,
21
+ vae_dim: int = 64,
22
+ fix_std: float = 0.5,
23
+ std_dist_type: str = 'gaussian',
24
+ # common
25
+ mixer_layer: str = 'depthwise_conv',
26
+ conv_norm: str = 'none',
27
+ pad_mode: str = 'constant',
28
+ disable_last_norm: bool = True,
29
+ layernorm: str = 'RMSNorm',
30
+ layernorm_eps: float = 1e-5,
31
+ layernorm_elementwise_affine: bool = True,
32
+ conv_bias: bool = True,
33
+ layer_scale_init_value: float = 1e-6,
34
+ weight_init_value: float = 1e-2,
35
+ # encoder specific
36
+ encoder_n_filters: int = 32,
37
+ encoder_ratios: Optional[List[int]] = None,  # avoid a mutable default; resolved to [8, 5, 5, 4, 2, 2] below
38
+ encoder_depths: str = "3-3-3-3-3-3-8",
39
+ # decoder specific
40
+ decoder_n_filters: int = 32,
41
+ decoder_ratios: Optional[List[int]] = None, # if None, same as encoder
42
+ decoder_depths: Optional[str] = None,
43
+ **kwargs
44
+ ):
45
+ super().__init__(**kwargs)
46
+ self.channels = channels
47
+ self.corpus_normalize = corpus_normalize
48
+ self.causal = causal
49
+ self.vae_dim = vae_dim
50
+ self.fix_std = fix_std
51
+ self.std_dist_type = std_dist_type
52
+
53
+ # common parameters
54
+ self.conv_norm = conv_norm
55
+ self.pad_mode = pad_mode
56
+ self.layernorm_eps = layernorm_eps
57
+ self.disable_last_norm = disable_last_norm
58
+ self.layernorm = layernorm
59
+ self.layernorm_elementwise_affine = layernorm_elementwise_affine
60
+ self.conv_bias = conv_bias
61
+ self.layer_scale_init_value = layer_scale_init_value
62
+ self.weight_init_value = weight_init_value
63
+ self.mixer_layer = mixer_layer
64
+
65
+ # encoder specific parameters
66
+ self.encoder_n_filters = encoder_n_filters
67
+ self.encoder_ratios = encoder_ratios if encoder_ratios is not None else [8, 5, 5, 4, 2, 2]
68
+ self.encoder_depths = encoder_depths
69
+
70
+ # decoder specific parameters
71
+ self.decoder_ratios = decoder_ratios if decoder_ratios is not None else self.encoder_ratios
72
+ self.decoder_n_filters = decoder_n_filters
73
+ self.decoder_depths = decoder_depths
74
+
75
+
76
+ class VibeVoiceSemanticTokenizerConfig(PretrainedConfig):
77
+ model_type = "vibevoice_semantic_tokenizer"
78
+
79
+ def __init__(
80
+ self,
81
+ channels: int = 1,
82
+ corpus_normalize: float = 0.0,
83
+ causal: bool = True,
84
+ vae_dim: int = 64,
85
+ fix_std: float = 0,
86
+ std_dist_type: str = 'none',
87
+ # common
88
+ mixer_layer: str = 'depthwise_conv',
89
+ conv_norm: str = 'none',
90
+ pad_mode: str = 'constant',
91
+ disable_last_norm: bool = True,
92
+ layernorm: str = 'RMSNorm',
93
+ layernorm_eps: float = 1e-5,
94
+ layernorm_elementwise_affine: bool = True,
95
+ conv_bias: bool = True,
96
+ layer_scale_init_value: float = 1e-6,
97
+ weight_init_value: float = 1e-2,
98
+ # encoder specific
99
+ encoder_n_filters: int = 32,
100
+ encoder_ratios: Optional[List[int]] = None,  # avoid a mutable default; resolved to [8, 5, 5, 4, 2, 2] below
101
+ encoder_depths: str = "3-3-3-3-3-3-8",
102
+ **kwargs
103
+ ):
104
+ super().__init__(**kwargs)
105
+ self.channels = channels
106
+ self.corpus_normalize = corpus_normalize
107
+ self.causal = causal
108
+ self.vae_dim = vae_dim
109
+ self.fix_std = fix_std
110
+ self.std_dist_type = std_dist_type
111
+
112
+ # common parameters
113
+ self.conv_norm = conv_norm
114
+ self.pad_mode = pad_mode
115
+ self.layernorm_eps = layernorm_eps
116
+ self.disable_last_norm = disable_last_norm
117
+ self.layernorm = layernorm
118
+ self.layernorm_elementwise_affine = layernorm_elementwise_affine
119
+ self.conv_bias = conv_bias
120
+ self.layer_scale_init_value = layer_scale_init_value
121
+ self.weight_init_value = weight_init_value
122
+ self.mixer_layer = mixer_layer
123
+
124
+ # encoder specific parameters
125
+ self.encoder_n_filters = encoder_n_filters
126
+ self.encoder_ratios = encoder_ratios if encoder_ratios is not None else [8, 5, 5, 4, 2, 2]
127
+ self.encoder_depths = encoder_depths
128
+
129
+
130
+ class VibeVoiceDiffusionHeadConfig(PretrainedConfig):
131
+ model_type = "vibevoice_diffusion_head"
132
+
133
+ def __init__(
134
+ self,
135
+ hidden_size=768,
136
+ head_layers=4,
137
+ head_ffn_ratio=3.0,
138
+ rms_norm_eps=1e-5,
139
+ latent_size=64,
140
+ speech_vae_dim=None,
141
+ prediction_type="v_prediction",
142
+ diffusion_type="ddpm",
143
+ ddpm_num_steps=1000,
144
+ ddpm_num_inference_steps=20,
145
+ ddpm_beta_schedule="cosine",
146
+ ddpm_batch_mul=4,
147
+ **kwargs
148
+ ):
149
+ self.hidden_size = hidden_size
150
+ self.head_layers = head_layers
151
+ self.head_ffn_ratio = head_ffn_ratio
152
+ self.rms_norm_eps = rms_norm_eps
153
+ self.latent_size = latent_size
154
+ self.speech_vae_dim = speech_vae_dim
155
+ self.prediction_type = prediction_type
156
+ self.diffusion_type = diffusion_type
157
+ self.ddpm_num_steps = ddpm_num_steps
158
+ self.ddpm_num_inference_steps = ddpm_num_inference_steps
159
+ self.ddpm_beta_schedule = ddpm_beta_schedule
160
+ self.ddpm_batch_mul = ddpm_batch_mul
161
+
162
+ super().__init__(**kwargs)
163
+
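The `ddpm_*` fields above parameterize the diffusion sampler rather than the head network itself; `modeling_vibevoice_streaming.py` later in this commit feeds them straight into a `DPMSolverMultistepScheduler`. A minimal sketch of that wiring, mirroring the modeling code and assuming the `vibevoice` package layout from this commit:

    from vibevoice.modular.configuration_vibevoice import VibeVoiceDiffusionHeadConfig
    from vibevoice.schedule.dpm_solver import DPMSolverMultistepScheduler

    head_config = VibeVoiceDiffusionHeadConfig()  # defaults: 1000 train steps, cosine betas, v_prediction
    scheduler = DPMSolverMultistepScheduler(
        num_train_timesteps=head_config.ddpm_num_steps,
        beta_schedule=head_config.ddpm_beta_schedule,
        prediction_type=head_config.prediction_type,
    )
    # At inference, only head_config.ddpm_num_inference_steps (20) solver steps are taken.
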
+class VibeVoiceConfig(PretrainedConfig):
+    model_type = "vibevoice"
+    is_composition = True
+    sub_configs = {
+        "acoustic_tokenizer_config": VibeVoiceAcousticTokenizerConfig,
+        "semantic_tokenizer_config": VibeVoiceSemanticTokenizerConfig,
+        "decoder_config": Qwen2Config,
+        "diffusion_head_config": VibeVoiceDiffusionHeadConfig,
+    }
+    # keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen2`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+
+    def __init__(
+        self,
+        acoustic_tokenizer_config=None,
+        semantic_tokenizer_config=None,
+        decoder_config=None,
+        diffusion_head_config=None,
+        **kwargs
+    ):
+
+        # kwargs["_attn_implementation"] = "flash_attention_2"
+        kwargs["_attn_implementation_autoset"] = False
+
+        if acoustic_tokenizer_config is None:
+            self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"]()
+        elif isinstance(acoustic_tokenizer_config, dict):
+            acoustic_tokenizer_config["model_type"] = "vibevoice_acoustic_tokenizer"
+            self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"](**acoustic_tokenizer_config)
+        elif isinstance(acoustic_tokenizer_config, VibeVoiceAcousticTokenizerConfig):
+            # If an instance of the config class is provided
+            self.acoustic_tokenizer_config = acoustic_tokenizer_config
+
+        if semantic_tokenizer_config is None:
+            self.semantic_tokenizer_config = self.sub_configs["semantic_tokenizer_config"]()
+        elif isinstance(semantic_tokenizer_config, dict):
+            semantic_tokenizer_config["model_type"] = "vibevoice_semantic_tokenizer"
+            self.semantic_tokenizer_config = self.sub_configs["semantic_tokenizer_config"](**semantic_tokenizer_config)
+        elif isinstance(semantic_tokenizer_config, VibeVoiceSemanticTokenizerConfig):
+            # If an instance of the config class is provided
+            self.semantic_tokenizer_config = semantic_tokenizer_config
+
+        if decoder_config is None:
+            self.decoder_config = self.sub_configs["decoder_config"]()
+        elif isinstance(decoder_config, dict):
+            # If a dictionary is provided, instantiate the config class with it
+            # self.decoder_config = self.sub_configs["decoder_config"](**decoder_config)
+            if decoder_config.get("model_type", '') == "qwen2":
+                self.decoder_config = Qwen2Config(**decoder_config)
+            else:
+                raise ValueError(f"Unsupported decoder model type: {decoder_config.get('model_type', '')}")
+        elif isinstance(decoder_config, (Qwen2Config,)):
+            # If an instance of the config class is provided
+            self.decoder_config = decoder_config
+
+        if diffusion_head_config is None:
+            self.diffusion_head_config = self.sub_configs["diffusion_head_config"]()
+        elif isinstance(diffusion_head_config, dict):
+            diffusion_head_config["model_type"] = "vibevoice_diffusion_head"
+            self.diffusion_head_config = self.sub_configs["diffusion_head_config"](**diffusion_head_config)
+        elif isinstance(diffusion_head_config, VibeVoiceDiffusionHeadConfig):
+            # If an instance of the config class is provided
+            self.diffusion_head_config = diffusion_head_config
+
+        # other parameters
+        self.acoustic_vae_dim = getattr(self.acoustic_tokenizer_config, 'vae_dim', 64)
+        self.semantic_vae_dim = getattr(self.semantic_tokenizer_config, 'vae_dim', 128)
+
+        super().__init__(**kwargs)
+
+__all__ = [
+    "VibeVoiceAcousticTokenizerConfig",
+    "VibeVoiceSemanticTokenizerConfig",
+    "VibeVoiceDiffusionHeadConfig",
+    "VibeVoiceConfig"
+]
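For orientation, a minimal sketch of constructing this composite config: sub-configs may be passed as dicts (the dict's `model_type` is forced to the matching value, except for the decoder, which must already declare `qwen2`) or as ready config instances, and omitted ones fall back to defaults. The values below are illustrative only.

    from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig

    # Nested dicts are routed to the matching sub-config classes.
    config = VibeVoiceConfig(
        acoustic_tokenizer_config={"vae_dim": 64, "causal": True},
        decoder_config={"model_type": "qwen2"},  # any other model_type raises ValueError
        diffusion_head_config={"ddpm_num_inference_steps": 10},
    )
    print(config.acoustic_vae_dim, config.semantic_vae_dim)  # 64 64 with these defaults
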
vibevoice/modular/configuration_vibevoice_streaming.py ADDED
@@ -0,0 +1,85 @@
+"""VibeVoice Streaming model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+
+from .configuration_vibevoice import VibeVoiceAcousticTokenizerConfig, VibeVoiceDiffusionHeadConfig
+
+logger = logging.get_logger(__name__)
+
+
+class VibeVoiceStreamingConfig(PretrainedConfig):
+    model_type = "vibevoice_streaming"
+    is_composition = True
+    sub_configs = {
+        "acoustic_tokenizer_config": VibeVoiceAcousticTokenizerConfig,
+        "decoder_config": Qwen2Config,
+        "diffusion_head_config": VibeVoiceDiffusionHeadConfig,
+    }
+    # keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen2`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+
+    def __init__(
+        self,
+        acoustic_tokenizer_config=None,
+        decoder_config=None,
+        diffusion_head_config=None,
+        tts_backbone_num_hidden_layers=20,
+        **kwargs
+    ):
+
+        # kwargs["_attn_implementation"] = "flash_attention_2"
+        kwargs["_attn_implementation_autoset"] = False
+
+        if acoustic_tokenizer_config is None:
+            self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"]()
+        elif isinstance(acoustic_tokenizer_config, dict):
+            acoustic_tokenizer_config["model_type"] = "vibevoice_acoustic_tokenizer"
+            self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"](**acoustic_tokenizer_config)
+        elif isinstance(acoustic_tokenizer_config, VibeVoiceAcousticTokenizerConfig):
+            # If an instance of the config class is provided
+            self.acoustic_tokenizer_config = acoustic_tokenizer_config
+
+        if decoder_config is None:
+            self.decoder_config = self.sub_configs["decoder_config"]()
+        elif isinstance(decoder_config, dict):
+            # If a dictionary is provided, instantiate the config class with it
+            # self.decoder_config = self.sub_configs["decoder_config"](**decoder_config)
+            if decoder_config.get("model_type", '') == "qwen2":
+                self.decoder_config = Qwen2Config(**decoder_config)
+            else:
+                raise ValueError(f"Unsupported decoder model type: {decoder_config.get('model_type', '')}")
+        elif isinstance(decoder_config, (Qwen2Config,)):
+            # If an instance of the config class is provided
+            self.decoder_config = decoder_config
+
+        if diffusion_head_config is None:
+            self.diffusion_head_config = self.sub_configs["diffusion_head_config"]()
+        elif isinstance(diffusion_head_config, dict):
+            diffusion_head_config["model_type"] = "vibevoice_diffusion_head"
+            self.diffusion_head_config = self.sub_configs["diffusion_head_config"](**diffusion_head_config)
+        elif isinstance(diffusion_head_config, VibeVoiceDiffusionHeadConfig):
+            # If an instance of the config class is provided
+            self.diffusion_head_config = diffusion_head_config
+
+        # other parameters
+        self.acoustic_vae_dim = getattr(self.acoustic_tokenizer_config, 'vae_dim', 64)
+        # The decoder of the model is divided into two components. The lower Transformer layers are only used for encoding text, while the upper Transformer layers are used for encoding text and generating speech. `tts_backbone_num_hidden_layers` indicates the number of upper layers used for TTS.
+        self.tts_backbone_num_hidden_layers = tts_backbone_num_hidden_layers
+
+        super().__init__(**kwargs)
+
+__all__ = [
+    "VibeVoiceStreamingConfig"
+]
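The layer split that `tts_backbone_num_hidden_layers` controls is simple subtraction, exactly as the modeling file below performs it. A small illustrative sketch (the concrete totals depend on the checkpoint's decoder config):

    from vibevoice.modular.configuration_vibevoice_streaming import VibeVoiceStreamingConfig

    config = VibeVoiceStreamingConfig()
    total = config.decoder_config.num_hidden_layers  # full Qwen2 decoder depth
    upper = config.tts_backbone_num_hidden_layers    # 20 by default: text + speech layers
    lower = total - upper                            # remaining lower, text-only layers
    print(f"{lower} text-only layers, {upper} TTS layers")
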
vibevoice/modular/modeling_vibevoice_streaming.py ADDED
@@ -0,0 +1,190 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union, Callable
+from tqdm import tqdm
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributed as dist
+
+from transformers.models.auto import AutoModel, AutoModelForCausalLM
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import CausalLMOutput, BaseModelOutputWithPast, ModelOutput
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
+from transformers import modeling_utils
+from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.utils import logging
+
+from .modular_vibevoice_diffusion_head import VibeVoiceDiffusionHead
+from vibevoice.schedule.dpm_solver import DPMSolverMultistepScheduler
+
+from .configuration_vibevoice_streaming import VibeVoiceStreamingConfig
+
+
+logger = logging.get_logger(__name__)
+
+if not hasattr(modeling_utils, "ALL_PARALLEL_STYLES") or modeling_utils.ALL_PARALLEL_STYLES is None:
+    modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none", "colwise", "rowwise"]
+
+
+class BinaryClassifier(nn.Module):
+    def __init__(self, hidden_size):
+        super().__init__()
+        self.fc1 = nn.Linear(hidden_size, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, 1)
+
+    def forward(self, x):
+        x = torch.relu(self.fc1(x))
+        x = self.fc2(x)
+        return x
+
+
+class SpeechConnector(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super().__init__()
+        self.fc1 = nn.Linear(input_dim, output_dim)
+        self.norm = LlamaRMSNorm(output_dim, eps=1e-6)
+        self.fc2 = nn.Linear(output_dim, output_dim)
+
+    def forward(self, features, **kwargs):
+        x = self.fc1(features)
+        x = self.norm(x)
+        x = self.fc2(x)
+        return x
+
+
+# @auto_docstring
+class VibeVoiceStreamingPreTrainedModel(PreTrainedModel):
+    config_class = VibeVoiceStreamingConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    _supports_cache_class = True
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        if isinstance(module, VibeVoiceDiffusionHead):
+            module.initialize_weights()
+            return
+
+        # Use the language model's initializer_range if available
+        if hasattr(self.config, 'language_model_config') and hasattr(self.config.language_model_config, 'initializer_range'):
+            std = self.config.language_model_config.initializer_range
+        elif hasattr(self.config, 'decoder_config') and hasattr(self.config.decoder_config, 'initializer_range'):
+            std = self.config.decoder_config.initializer_range
+        else:
+            std = 0.02  # Default value
+
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
+
+# @auto_docstring
+class VibeVoiceStreamingModel(VibeVoiceStreamingPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, 'torch_dtype') and config.torch_dtype is not None:
+            if isinstance(config.torch_dtype, str):
+                dtype = getattr(torch, config.torch_dtype)
+            else:
+                dtype = config.torch_dtype
+        else:
+            dtype = torch.float32
+
+        # Initialize Qwen2 model for language modeling.
+        # The lower Transformer layers are only used for encoding text, while the upper Transformer layers are used for encoding text and generating speech.
+        # To keep the code clean, we construct two language models.
+        # The final norm layer of the first language_model is set to identity and will not be used in inference.
+        lm_config = copy.deepcopy(config.decoder_config)
+        lm_backbone_num_hidden_layers = getattr(lm_config, 'num_hidden_layers', 24) - config.tts_backbone_num_hidden_layers
+        lm_config.num_hidden_layers = lm_backbone_num_hidden_layers
+        self.language_model = AutoModel.from_config(lm_config)
+        self.language_model.norm = nn.Identity()
+
+        # We only need the Transformer layers here. Note that embed_tokens in tts_language_model is unused.
+        tts_lm_config = copy.deepcopy(lm_config)
+        tts_lm_config.num_hidden_layers = config.tts_backbone_num_hidden_layers
+        self.tts_language_model = AutoModel.from_config(tts_lm_config)
+
+        # Marks the text that needs to be spoken by the TTS model.
+        self.tts_input_types = nn.Embedding(num_embeddings=2, embedding_dim=config.decoder_config.hidden_size)
+
+        # Initialize speech components if needed
+        self.acoustic_tokenizer = AutoModel.from_config(config.acoustic_tokenizer_config).to(dtype)
+        self.acoustic_connector = SpeechConnector(config.acoustic_vae_dim, lm_config.hidden_size).to(dtype)
+
+        # Register scaling factors as buffers - use 1D tensors for FSDP compatibility
+        self.register_buffer('speech_scaling_factor', torch.tensor(float('nan')))
+        self.register_buffer('speech_bias_factor', torch.tensor(float('nan')))
+
+        # Initialize prediction head for speech generation
+        self.prediction_head = AutoModel.from_config(config.diffusion_head_config).to(dtype)
+
+        # Initialize noise scheduler
+        self.noise_scheduler = DPMSolverMultistepScheduler(
+            num_train_timesteps=config.diffusion_head_config.ddpm_num_steps,
+            beta_schedule=config.diffusion_head_config.ddpm_beta_schedule,
+            prediction_type=config.diffusion_head_config.prediction_type
+        )
+
+    def get_input_embeddings(self):
+        if hasattr(self.language_model, 'embed_tokens'):
+            # If the language model has an embed_tokens attribute, return it
+            return self.language_model.embed_tokens
+
+        for name, attr in self.language_model.fullmap.items():  # parallelized by nnscaler; the attribute name is changed
+            if attr.orig_name == 'embed_tokens.weight':
+                return getattr(self.language_model, name)
+        assert False, 'should not arrive here'
+
+    def set_input_embeddings(self, value):
+        self.language_model.embed_tokens = value
+
+    def set_speech_tokenizers(self, acoustic_tokenizer=None):
+        """Set the speech tokenizers used for encoding and decoding speech."""
+        self.acoustic_tokenizer = acoustic_tokenizer
+
+        # Put the tokenizer in evaluation mode
+        if self.acoustic_tokenizer is not None:
+            self.acoustic_tokenizer.eval()
+
+    def forward(self, *args, **kwargs):
+        """
+        Intentionally not implemented.
+
+        This streaming model is split into two explicit submodules:
+        - `language_model` for plain text processing (lower layers).
+        - `tts_language_model` for TTS-related upper layers.
+
+        We deliberately avoid a unified `forward` to prevent accidental calls
+        that mix responsibilities.
+
+        To use the model:
+        - Call `self.language_model(...)` for text embeddings / hidden states.
+        - Call `self.tts_language_model(...)` for the TTS portion.
+        - Use the dedicated inference class for combined generation logic.
+        """
+        raise RuntimeError(
+            "VibeVoiceStreamingModel.forward is intentionally disabled. "
+            "Use `model.language_model(...)` or `model.tts_language_model(...)` instead."
+        )
+
+
+AutoModel.register(VibeVoiceStreamingConfig, VibeVoiceStreamingModel)
+
+__all__ = [
+    "VibeVoiceStreamingPreTrainedModel",
+    "VibeVoiceStreamingModel",
+]
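To make the prescribed call pattern concrete, here is a minimal sketch of driving the two stacks directly, per the `forward` docstring above. It is illustrative only: the small decoder dict is made up for a cheap random-weight model, `import vibevoice` is assumed to register the tokenizer and diffusion-head classes with AutoModel, and the real pipeline additionally injects the `tts_input_types` marker embeddings, the acoustic connector outputs, and the diffusion head, none of which is shown here.

    import torch
    import vibevoice  # assumed to trigger the AutoModel registrations used in __init__
    from vibevoice.modular.configuration_vibevoice_streaming import VibeVoiceStreamingConfig
    from vibevoice.modular.modeling_vibevoice_streaming import VibeVoiceStreamingModel

    # Tiny, made-up decoder config so the random-weight model stays small.
    config = VibeVoiceStreamingConfig(
        decoder_config={
            "model_type": "qwen2",
            "hidden_size": 256,
            "intermediate_size": 512,
            "num_hidden_layers": 24,
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
        },
        tts_backbone_num_hidden_layers=20,  # 24 - 20 = 4 lower, text-only layers
    )
    model = VibeVoiceStreamingModel(config).eval()

    input_ids = torch.tensor([[101, 102, 103]])      # toy token ids
    embeds = model.get_input_embeddings()(input_ids)

    with torch.no_grad():
        # Lower stack: text-only hidden states (its final norm is an Identity).
        text_hidden = model.language_model(inputs_embeds=embeds).last_hidden_state
        # Upper stack: consumes hidden states, not token ids.
        tts_hidden = model.tts_language_model(inputs_embeds=text_hidden).last_hidden_state

    # Calling model(...) directly raises RuntimeError by design.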