valtecAI-team committed on
Commit
2ef8d92
·
verified ·
1 Parent(s): 60f7608

Upload folder using huggingface_hub

Browse files
valtec_tts/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Valtec Vietnamese TTS - Text to Speech for Vietnamese
3
+
4
+ Simple usage:
5
+ from valtec_tts import TTS
6
+
7
+ tts = TTS()
8
+ tts.speak("Xin chào các bạn", output_path="output.wav")
9
+ """
10
+
11
# Package metadata.
__version__ = "1.0.0"
__author__ = "Valtec Team"

# Re-export the high-level API so callers can write ``from valtec_tts import TTS``.
from .tts import TTS

# Names exposed by ``from valtec_tts import *``.
__all__ = ["TTS", "__version__"]
valtec_tts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (476 Bytes). View file
 
valtec_tts/__pycache__/tts.cpython-310.pyc ADDED
Binary file (6.97 kB). View file
 
valtec_tts/tts.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Valtec TTS - Simple Vietnamese Text-to-Speech API
3
+
4
+ Usage:
5
+ from valtec_tts import TTS
6
+
7
+ tts = TTS()
8
+ tts.speak("Xin chào các bạn", output_path="output.wav")
9
+
10
+ # Or get audio directly
11
+ audio, sr = tts.synthesize("Xin chào")
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+ from typing import Optional, Tuple, Union
18
+ import json
19
+
20
+ import numpy as np
21
+
22
# huggingface_hub is an optional dependency used only for automatic model
# download; record whether it could be imported so callers can fail lazily.
try:
    from huggingface_hub import hf_hub_download, snapshot_download
except ImportError:
    HF_HUB_AVAILABLE = False
else:
    HF_HUB_AVAILABLE = True


# Hugging Face repository the pretrained model is fetched from by default,
# and the directory name used for it inside the local cache.
DEFAULT_HF_REPO = "valtecAI-team/valtec-tts-pretrained"
DEFAULT_MODEL_NAME = "vits-vietnamese"
33
+
34
+ # Local cache directory
35
def get_cache_dir() -> Path:
    """Return the directory used to cache downloaded models, creating it if needed.

    The base location comes from ``LOCALAPPDATA`` on Windows and from
    ``XDG_CACHE_HOME`` on every other platform. When that variable is unset
    or empty, the per-user default is used instead
    (``~/AppData/Local`` on Windows, ``~/.cache`` elsewhere).

    Returns:
        Path to ``<cache base>/valtec_tts/models``. The directory (and any
        missing parents) exists when this function returns.
    """
    if os.name == 'nt':  # Windows
        configured = os.environ.get('LOCALAPPDATA')
        default_parts = ('AppData', 'Local')
    else:  # Linux/Mac
        configured = os.environ.get('XDG_CACHE_HOME')
        default_parts = ('.cache',)

    if configured:
        cache_base = Path(configured)
    else:
        # An empty value must be treated like an unset one; otherwise
        # Path('') would silently place the cache in the current working
        # directory. Path.home() is only resolved when actually needed.
        cache_base = Path.home().joinpath(*default_parts)

    cache_dir = cache_base / 'valtec_tts' / 'models'
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir
46
+
47
+
48
class TTS:
    """
    Simple Vietnamese Text-to-Speech interface.

    Wraps the ``VietnameseTTS`` engine imported from the ``infer`` module.
    The model is loaded in the constructor, either from a local directory or
    from a copy downloaded from the Hugging Face Hub into the local cache.

    Attributes set by the constructor:
        hf_repo: Hugging Face repository ID used for auto-download.
        device: Device string handed to the engine.
        model_path: ``Path`` of the directory holding the checkpoint and
            ``config.json``.
        speakers: Speakers reported by the loaded engine.
        default_speaker: First entry of ``speakers``, or None if there are none.

    Example:
        tts = TTS()
        tts.speak("Xin chào", output_path="hello.wav")

        # Or get audio array
        audio, sr = tts.synthesize("Xin chào")
    """

    def __init__(
        self,
        model_path: Optional[Union[str, Path]] = None,
        device: str = "auto",
        hf_repo: str = DEFAULT_HF_REPO,
    ):
        """
        Initialize TTS engine.

        Args:
            model_path: Path to local model directory. If None, auto-downloads from Hugging Face.
            device: Device to use ('cuda', 'cpu', or 'auto' for automatic detection).
            hf_repo: Hugging Face repository ID for model download.
        """
        self.hf_repo = hf_repo

        # Determine device
        if device == "auto":
            # Imported lazily so torch is only required for auto-detection here.
            import torch
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        # Get model path
        if model_path is None:
            model_path = self._ensure_model_available()

        self.model_path = Path(model_path)
        self._engine = None
        self._load_model()

    def _cached_model_dir(self) -> Path:
        """Return the cache directory that belongs to ``self.hf_repo``.

        The default repository keeps its original directory name
        (``DEFAULT_MODEL_NAME``). Any other repository gets a directory
        derived from its ID, so a cached copy of one repository is never
        mistaken for (or overwritten by) another.
        """
        if self.hf_repo == DEFAULT_HF_REPO:
            dir_name = DEFAULT_MODEL_NAME
        else:
            # Flatten path separators so the repo ID maps to a single directory.
            dir_name = self.hf_repo.replace("/", "--").replace("\\", "--")
        return get_cache_dir() / dir_name

    def _ensure_model_available(self) -> str:
        """Ensure model is available locally, download if not.

        A cached model counts as present when its directory contains both
        ``config.json`` and at least one ``G_*.pth`` checkpoint.

        Returns:
            Path of the local model directory, as a string.
        """
        model_dir = self._cached_model_dir()
        config_path = model_dir / "config.json"

        # Check if model already exists
        if config_path.exists() and any(model_dir.glob("G_*.pth")):
            print(f"Using cached model from: {model_dir}")
            return str(model_dir)

        # Need to download
        print(f"Model not found locally. Downloading from Hugging Face: {self.hf_repo}")
        return self._download_model(model_dir)

    def _download_model(self, target_dir: Path) -> str:
        """Download model from Hugging Face Hub.

        Args:
            target_dir: Directory the repository snapshot is written to;
                created if missing.

        Returns:
            ``target_dir`` as a string.

        Raises:
            RuntimeError: If ``huggingface_hub`` is not installed, or if the
                download fails (the original exception is attached as the cause).
        """
        if not HF_HUB_AVAILABLE:
            raise RuntimeError(
                "huggingface_hub is required for auto-download. "
                "Install with: pip install huggingface_hub"
            )

        target_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Download entire model directory
            print(f"Downloading model to: {target_dir}")
            # NOTE(review): local_dir_use_symlinks appears to be deprecated in
            # recent huggingface_hub releases — confirm against the pinned version.
            snapshot_download(
                repo_id=self.hf_repo,
                local_dir=str(target_dir),
                local_dir_use_symlinks=False,
            )
            print("Download complete!")
            return str(target_dir)

        except Exception as e:
            # Chain the original error so the full traceback stays available.
            raise RuntimeError(
                f"Failed to download model from {self.hf_repo}: {e}\n"
                "Please check your internet connection or provide a local model_path."
            ) from e

    def _load_model(self):
        """Load the TTS model.

        Locates the latest ``G`` checkpoint and ``config.json`` inside
        ``self.model_path``, builds the engine, and records its speakers.

        Raises:
            FileNotFoundError: If no checkpoint or no ``config.json`` is found.
        """
        # Add parent directory to path for imports: ``infer`` is imported as a
        # top-level module from the directory that contains this package.
        package_root = Path(__file__).parent.parent
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))

        from infer import VietnameseTTS, find_latest_checkpoint

        # Find checkpoint and config
        checkpoint = find_latest_checkpoint(str(self.model_path), "G")
        config_path = self.model_path / "config.json"

        if checkpoint is None:
            raise FileNotFoundError(f"No checkpoint found in {self.model_path}")
        if not config_path.exists():
            raise FileNotFoundError(f"config.json not found in {self.model_path}")

        print(f"Loading model from: {checkpoint}")
        self._engine = VietnameseTTS(checkpoint, str(config_path), self.device)

        # Store speakers
        self.speakers = self._engine.speakers
        self.default_speaker = self.speakers[0] if self.speakers else None
        print(f"Available speakers: {self.speakers}")

    def synthesize(
        self,
        text: str,
        speaker: Optional[str] = None,
        speed: float = 1.0,
        noise_scale: float = 0.667,
        noise_scale_w: float = 0.8,
        sdp_ratio: float = 0.0,
    ) -> Tuple[np.ndarray, int]:
        """
        Synthesize speech from text.

        Args:
            text: Vietnamese text to synthesize.
            speaker: Speaker name. Uses default if not specified.
            speed: Speech speed (1.0 = normal, < 1.0 = faster, > 1.0 = slower).
                Forwarded to the engine as ``length_scale``.
            noise_scale: Controls voice variability.
            noise_scale_w: Controls duration variability.
            sdp_ratio: Stochastic duration predictor ratio (0 = deterministic).

        Returns:
            Tuple of (audio_array, sample_rate)

        Raises:
            RuntimeError: If no engine has been loaded.
        """
        if self._engine is None:
            raise RuntimeError("Model not loaded")

        # A missing/empty speaker falls back to the default speaker.
        speaker = speaker or self.default_speaker

        audio, sr = self._engine.synthesize(
            text=text,
            speaker=speaker,
            length_scale=speed,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            sdp_ratio=sdp_ratio,
        )

        return audio, sr

    def speak(
        self,
        text: str,
        output_path: Union[str, Path] = "output.wav",
        speaker: Optional[str] = None,
        speed: float = 1.0,
        play: bool = False,
        **kwargs
    ) -> str:
        """
        Synthesize and save speech to file.

        Args:
            text: Vietnamese text to synthesize.
            output_path: Path to save the audio file. Missing parent
                directories are created.
            speaker: Speaker name. Uses default if not specified.
            speed: Speech speed (1.0 = normal).
            play: If True, attempt to play the audio (requires sounddevice)
                and wait for playback to finish.
            **kwargs: Additional arguments passed to synthesize().

        Returns:
            Path to the saved audio file.
        """
        audio, sr = self.synthesize(text, speaker=speaker, speed=speed, **kwargs)

        # Save audio
        import soundfile as sf
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        sf.write(str(output_path), audio, sr)
        print(f"Audio saved to: {output_path}")

        # Optionally play audio; a missing sounddevice package is reported
        # but does not fail the call, since the file has already been written.
        if play:
            try:
                import sounddevice as sd
                sd.play(audio, sr)
                sd.wait()
            except ImportError:
                print("Install sounddevice to play audio: pip install sounddevice")

        return str(output_path)

    def list_speakers(self) -> list:
        """Get list of available speakers."""
        return self.speakers

    def __repr__(self) -> str:
        """Return a short description showing the device and speakers."""
        return f"TTS(device='{self.device}', speakers={self.speakers})"