Spaces:
Running on Zero
Running on Zero
app.py
CHANGED
|
@@ -14,7 +14,7 @@ from huggingface_hub import snapshot_download, login
|
|
| 14 |
from qwen_tts import Qwen3TTSModel
|
| 15 |
import functools
|
| 16 |
import uuid
|
| 17 |
-
|
| 18 |
# 配置日志
|
| 19 |
logging.basicConfig(
|
| 20 |
level=logging.INFO,
|
|
@@ -42,7 +42,14 @@ SPEAKERS = [
|
|
| 42 |
"Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
|
| 43 |
]
|
| 44 |
LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
def get_model_path(model_type: str, model_size: str) -> str:
|
| 48 |
"""Get model path based on type and size."""
|
|
@@ -211,7 +218,7 @@ def split_text(text, max_len=100):
|
|
| 211 |
def infer_voice_design(part, language, voice_description):
|
| 212 |
"""Single segment inference for Voice Design."""
|
| 213 |
voice_design_model = load_model("VoiceDesign","1.7B")
|
| 214 |
-
|
| 215 |
wavs, sr = voice_design_model.generate_voice_design(
|
| 216 |
text=part,
|
| 217 |
language=language,
|
|
@@ -227,6 +234,7 @@ def infer_voice_design(part, language, voice_description):
|
|
| 227 |
def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
|
| 228 |
"""Single segment inference for Voice Clone using reference audio."""
|
| 229 |
# tts = BASE_MODELS[model_size]
|
|
|
|
| 230 |
tts = load_model("Base", "0.6B")
|
| 231 |
voice_clone_prompt = tts.create_voice_clone_prompt(
|
| 232 |
ref_audio=audio_tuple,
|
|
@@ -238,12 +246,17 @@ def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
|
|
| 238 |
language=language,
|
| 239 |
voice_clone_prompt=voice_clone_prompt,
|
| 240 |
max_new_tokens=2048,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
)
|
| 242 |
return wavs[0], sr
|
| 243 |
|
| 244 |
@spaces.GPU
|
| 245 |
def infer_voice_clone_from_prompt(part, language, prompt_file_path):
|
| 246 |
"""Single segment inference for Voice Clone using pre-extracted prompt."""
|
|
|
|
| 247 |
logger.info("正在加载音频特征文件...")
|
| 248 |
voice_clone_prompt = torch.load(prompt_file_path, map_location='cuda', weights_only=False)
|
| 249 |
logger.info("音频特征文件加载成功。")
|
|
@@ -254,6 +267,10 @@ def infer_voice_clone_from_prompt(part, language, prompt_file_path):
|
|
| 254 |
language=language,
|
| 255 |
voice_clone_prompt=voice_clone_prompt,
|
| 256 |
max_new_tokens=2048,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
)
|
| 258 |
return wavs[0], sr
|
| 259 |
|
|
@@ -261,7 +278,7 @@ def infer_voice_clone_from_prompt(part, language, prompt_file_path):
|
|
| 261 |
def extract_voice_clone_prompt(ref_audio,ref_text,use_xvector_only):
|
| 262 |
logger.info("正在提取参考音频特征(仅执行一次)...")
|
| 263 |
tts = load_model("Base", "0.6B")
|
| 264 |
-
|
| 265 |
audio_tuple = _audio_to_tuple(ref_audio)
|
| 266 |
if audio_tuple is None:
|
| 267 |
return None, "错误:需要参考音频。"
|
|
|
|
| 14 |
from qwen_tts import Qwen3TTSModel
|
| 15 |
import functools
|
| 16 |
import uuid
|
| 17 |
+
import random
|
| 18 |
# 配置日志
|
| 19 |
logging.basicConfig(
|
| 20 |
level=logging.INFO,
|
|
|
|
| 42 |
"Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
|
| 43 |
]
|
| 44 |
LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
|
| 45 |
+
def seed_everything(seed=42):
    """Seed every random number generator this module relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus the current and all CUDA devices) with the same value, then
    switches cuDNN to deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.

    Returns:
        None. The function only mutates global RNG and backend state.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Prefer deterministic cuDNN kernels and disable the autotuner, so kernel
    # selection does not vary from one call to the next.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
|
| 53 |
|
| 54 |
def get_model_path(model_type: str, model_size: str) -> str:
|
| 55 |
"""Get model path based on type and size."""
|
|
|
|
| 218 |
def infer_voice_design(part, language, voice_description):
|
| 219 |
"""Single segment inference for Voice Design."""
|
| 220 |
voice_design_model = load_model("VoiceDesign","1.7B")
|
| 221 |
+
seed_everything(42)
|
| 222 |
wavs, sr = voice_design_model.generate_voice_design(
|
| 223 |
text=part,
|
| 224 |
language=language,
|
|
|
|
| 234 |
def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
|
| 235 |
"""Single segment inference for Voice Clone using reference audio."""
|
| 236 |
# tts = BASE_MODELS[model_size]
|
| 237 |
+
seed_everything(42)
|
| 238 |
tts = load_model("Base", "0.6B")
|
| 239 |
voice_clone_prompt = tts.create_voice_clone_prompt(
|
| 240 |
ref_audio=audio_tuple,
|
|
|
|
| 246 |
language=language,
|
| 247 |
voice_clone_prompt=voice_clone_prompt,
|
| 248 |
max_new_tokens=2048,
|
| 249 |
+
# 核心参数:固定 seed
|
| 250 |
+
seed=42,
|
| 251 |
+
temperature=0.3, # 配合低温度,音色会更稳
|
| 252 |
+
top_p=0.85
|
| 253 |
)
|
| 254 |
return wavs[0], sr
|
| 255 |
|
| 256 |
@spaces.GPU
|
| 257 |
def infer_voice_clone_from_prompt(part, language, prompt_file_path):
|
| 258 |
"""Single segment inference for Voice Clone using pre-extracted prompt."""
|
| 259 |
+
seed_everything(42)
|
| 260 |
logger.info("正在加载音频特征文件...")
|
| 261 |
voice_clone_prompt = torch.load(prompt_file_path, map_location='cuda', weights_only=False)
|
| 262 |
logger.info("音频特征文件加载成功。")
|
|
|
|
| 267 |
language=language,
|
| 268 |
voice_clone_prompt=voice_clone_prompt,
|
| 269 |
max_new_tokens=2048,
|
| 270 |
+
# 核心参数:固定 seed
|
| 271 |
+
seed=42,
|
| 272 |
+
temperature=0.3, # 配合低温度,音色会更稳
|
| 273 |
+
top_p=0.85
|
| 274 |
)
|
| 275 |
return wavs[0], sr
|
| 276 |
|
|
|
|
| 278 |
def extract_voice_clone_prompt(ref_audio,ref_text,use_xvector_only):
|
| 279 |
logger.info("正在提取参考音频特征(仅执行一次)...")
|
| 280 |
tts = load_model("Base", "0.6B")
|
| 281 |
+
seed_everything(42)
|
| 282 |
audio_tuple = _audio_to_tuple(ref_audio)
|
| 283 |
if audio_tuple is None:
|
| 284 |
return None, "错误:需要参考音频。"
|