Text-to-Speech
Safetensors
Vietnamese
qwen2
geopromini committed on
Commit
cfbc436
·
verified ·
1 Parent(s): 2ad2cc8

Upload 9 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,38 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
38
+ VieNeu-TTS-F16.gguf filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - pnnbao-ump/VieNeu-TTS-1000h
5
+ - pnnbao-ump/VieNeuCodec-dataset
6
+ - pnnbao-ump/VieNeu-TTS-140h
7
+ language:
8
+ - vi
9
+ base_model:
10
+ - neuphonic/neutts-air
11
+ pipeline_tag: text-to-speech
12
+ ---
13
+
14
+ # VieNeu-TTS
15
+
16
+ [![GitHub](https://img.shields.io/badge/GitHub-Repository-blue)](https://github.com/pnnbao97/VieNeu-TTS)
17
+ [![Model](https://img.shields.io/badge/Hugging%20Face-Model-yellow)](https://huggingface.co/pnnbao-ump/VieNeu-TTS)
18
+
19
+
20
+ ![Untitled](https://cdn-uploads.huggingface.co/production/uploads/68b923a86c86c127a1975eda/vd7kW8h7ooSafcIhEQtyr.png)
21
+
22
+ ## Overview
23
+
24
+ **VieNeu-TTS** is an advanced on-device Vietnamese Text-to-Speech (TTS) model with **instant voice cloning**.
25
+
26
+ Trained on ~1000 hours of high-quality Vietnamese speech, this model is a significant upgrade over VieNeu-TTS-140h, with the following improvements:
27
+
28
+ - **Enhanced pronunciation**: More accurate and stable Vietnamese pronunciation
29
+ - **Code-switching support**: Seamless transitions between Vietnamese and English
30
+ - **Better voice cloning**: Higher fidelity and speaker consistency
31
+ - **Real-time synthesis**: 24 kHz waveform generation on CPU or GPU
32
+
33
+ VieNeu-TTS-1000h delivers production-ready speech synthesis fully offline.
34
+
35
+ **Author:** Phạm Nguyễn Ngọc Bảo
36
+
37
+ ## Support This Project
38
+
39
+ Training high-quality TTS models requires significant GPU resources and compute time. If you find this model useful, please consider supporting the development:
40
+
41
+ [![Buy Me a Coffee](https://img.shields.io/badge/Buy%20Me%20a%20Coffee-Support-orange?logo=buy-me-a-coffee)](https://buymeacoffee.com/pnnbao)
42
+
43
+ Your support helps maintain and improve VieNeu-TTS! 🙏
44
+
45
+ ---
46
+
47
+ ## Voice Cloning Inference
48
+
49
+ **Reference Voice (Speaker Example):**
50
+ <audio controls src="https://cdn-uploads.huggingface.co/production/uploads/68b923a86c86c127a1975eda/Rpw1V6X1px59SWQKn_W9D.wav"></audio>
51
+
52
+ **Input Text:**
53
+ > Trên bầu trời xanh thẳm, những đám mây trắng lửng lờ trôi như những chiếc thuyền nhỏ đang lướt nhẹ theo dòng gió. Dưới mặt đất, cánh đồng lúa vàng rực trải dài tới tận chân trời, những bông lúa nghiêng mình theo từng làn gió.
54
+
55
+ **Generated Output (Cloned Voice):**
56
+ <audio controls src="https://cdn-uploads.huggingface.co/production/uploads/68b923a86c86c127a1975eda/f40t4ueGqmsGDmNIGcU3J.mpga"></audio>
57
+
58
+ ---
59
+
60
+ ## Installation
61
+
62
+ ```bash
63
+ git clone https://github.com/pnnbao97/VieNeu-TTS.git
64
+ cd VieNeu-TTS
65
+ uv sync
66
+ ```
67
+
68
+ ## Gradio Demo
69
+
70
+ ```bash
71
+ uv run gradio_app.py
72
+ ```
73
+ Open your browser at `http://127.0.0.1:7860`.
74
+
75
+ **Demo Video:**
76
+
77
+
78
+ <video controls src="https://cdn-uploads.huggingface.co/production/uploads/68b923a86c86c127a1975eda/pRsUExICceCh47dgu4P8I.mp4" width="100%"></video>
79
+
80
+ ## Reference Voices
81
+
82
+ | File | Gender | Accent | Description |
83
+ |-------------------------|--------|--------|--------------------|
84
+ | Bình (nam miền Bắc) | Male | North | Male voice, Northern accent |
85
+ | Tuyên (nam miền Bắc) | Male | North | Male voice, Northern accent |
86
+ | Nguyên (nam miền Nam) | Male | South | Male voice, Southern accent |
87
+ | Sơn (nam miền Nam) | Male | South | Male voice, Southern accent |
88
+ | Vĩnh (nam miền Nam) | Male | South | Male voice, Southern accent |
89
+ | Hương (nữ miền Bắc) | Female | North | Female voice, Northern accent |
90
+ | Ly (nữ miền Bắc) | Female | North | Female voice, Northern accent |
91
+ | Ngọc (nữ miền Bắc) | Female | North | Female voice, Northern accent |
92
+ | Đoan (nữ miền Nam) | Female | South | Female voice, Southern accent |
93
+ | Dung (nữ miền Nam) | Female | South | Female voice, Southern accent |
94
+
95
+ ---
96
+
97
+ ## Model Architecture
98
+
99
+ | Component | Description |
100
+ |----------|-------------|
101
+ | Backbone | Qwen 0.5B (chat-format LM) |
102
+ | Codec | NeuCodec (supports ONNX + quantization) |
103
+ | Output | 24 kHz waveform synthesis |
104
+ | Context Window | 2048 tokens, shared between text and speech |
105
+ | Watermark | Enabled |
106
+ | Training Data | VieNeuCodec-dataset + Emilia dataset pretraining |
107
+
108
+ ## Features
109
+
110
+ - High-quality Vietnamese speech
111
+ - Instant **voice cloning** (3–5 second reference audio)
112
+ - Fully **offline**
113
+ - Runs in real time or faster
114
+ - Multi-voice reference support
115
+ - Python API + CLI + Gradio
116
+
117
+ ## Troubleshooting
118
+
119
+ | Issue | Cause | Solution |
120
+ |------|-------|----------|
121
+ | Missing `libespeak` | System dependency | Install eSpeak NG |
122
+ | GPU OOM | VRAM too small | Use CPU or quantized model |
123
+ | Poor voice match | Bad reference sample | Try a clearer reference clip |
124
+
125
+ ## License
126
+
127
+ Apache 2.0
128
+
129
+ ## Citation
130
+
131
+ ```bibtex
132
+ @misc{vieneutts2025,
133
+ title = {VieNeu-TTS: Vietnamese Text-to-Speech with Instant Voice Cloning},
134
+ author = {Pham Nguyen Ngoc Bao},
135
+ year = {2025},
136
+ publisher = {Hugging Face},
137
+ howpublished = {\url{https://huggingface.co/pnnbao-ump/VieNeu-TTS}}
138
+ }
139
+ ```
140
+
141
+ Please also cite the base model:
142
+
143
+ ```bibtex
144
+ @misc{neuttsair2025,
145
+ title = {NeuTTS Air: On-Device Speech Language Model with Instant Voice Cloning},
146
+ author = {Neuphonic},
147
+ year = {2025},
148
+ publisher = {Hugging Face},
149
+ howpublished = {\url{https://huggingface.co/neuphonic/neutts-air}}
150
+ }
151
+ ```
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 896,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 4864,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "max_window_layers": 21,
40
+ "model_type": "qwen2",
41
+ "num_attention_heads": 14,
42
+ "num_hidden_layers": 24,
43
+ "num_key_value_heads": 2,
44
+ "pad_token_id": 151645,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_scaling": null,
47
+ "rope_theta": 1000000.0,
48
+ "sliding_window": null,
49
+ "tie_word_embeddings": true,
50
+ "transformers_version": "4.56.0",
51
+ "use_cache": true,
52
+ "use_sliding_window": false,
53
+ "vocab_size": 217652
54
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151645,
8
+ "repetition_penalty": 1.1,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.56.0"
13
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e46be7e7cea1a3431e55f2f2c988e5e531724ec63c1e585079126567fa4d5633
3
+ size 1105860992
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|im_end|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74c466530bd698626a5b6a424d204711c58dfff0a6b3dd8b4dbac1e1e8c9aa87
3
+ size 24140239
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:058efca33cdb0e871f3e4f46c162cf3231e709129c0a024fb83317bffccd32f0
3
+ size 12063181
vocab.json ADDED
The diff for this file is too large to render. See raw diff