smartwang committed on
Commit
6438a49
·
1 Parent(s): a9ab89b
Files changed (1) hide show
  1. app.py +21 -4
app.py CHANGED
@@ -14,7 +14,7 @@ from huggingface_hub import snapshot_download, login
14
  from qwen_tts import Qwen3TTSModel
15
  import functools
16
  import uuid
17
-
18
  # 配置日志
19
  logging.basicConfig(
20
  level=logging.INFO,
@@ -42,7 +42,14 @@ SPEAKERS = [
42
  "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
43
  ]
44
  LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
45
-
 
 
 
 
 
 
 
46
 
47
  def get_model_path(model_type: str, model_size: str) -> str:
48
  """Get model path based on type and size."""
@@ -211,7 +218,7 @@ def split_text(text, max_len=100):
211
  def infer_voice_design(part, language, voice_description):
212
  """Single segment inference for Voice Design."""
213
  voice_design_model = load_model("VoiceDesign","1.7B")
214
-
215
  wavs, sr = voice_design_model.generate_voice_design(
216
  text=part,
217
  language=language,
@@ -227,6 +234,7 @@ def infer_voice_design(part, language, voice_description):
227
  def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
228
  """Single segment inference for Voice Clone using reference audio."""
229
  # tts = BASE_MODELS[model_size]
 
230
  tts = load_model("Base", "0.6B")
231
  voice_clone_prompt = tts.create_voice_clone_prompt(
232
  ref_audio=audio_tuple,
@@ -238,12 +246,17 @@ def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
238
  language=language,
239
  voice_clone_prompt=voice_clone_prompt,
240
  max_new_tokens=2048,
 
 
 
 
241
  )
242
  return wavs[0], sr
243
 
244
  @spaces.GPU
245
  def infer_voice_clone_from_prompt(part, language, prompt_file_path):
246
  """Single segment inference for Voice Clone using pre-extracted prompt."""
 
247
  logger.info("正在加载音频特征文件...")
248
  voice_clone_prompt = torch.load(prompt_file_path, map_location='cuda', weights_only=False)
249
  logger.info("音频特征文件加载成功。")
@@ -254,6 +267,10 @@ def infer_voice_clone_from_prompt(part, language, prompt_file_path):
254
  language=language,
255
  voice_clone_prompt=voice_clone_prompt,
256
  max_new_tokens=2048,
 
 
 
 
257
  )
258
  return wavs[0], sr
259
 
@@ -261,7 +278,7 @@ def infer_voice_clone_from_prompt(part, language, prompt_file_path):
261
  def extract_voice_clone_prompt(ref_audio,ref_text,use_xvector_only):
262
  logger.info("正在提取参考音频特征(仅执行一次)...")
263
  tts = load_model("Base", "0.6B")
264
-
265
  audio_tuple = _audio_to_tuple(ref_audio)
266
  if audio_tuple is None:
267
  return None, "错误:需要参考音频。"
 
14
  from qwen_tts import Qwen3TTSModel
15
  import functools
16
  import uuid
17
+ import random
18
  # 配置日志
19
  logging.basicConfig(
20
  level=logging.INFO,
 
42
  "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
43
  ]
44
  LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
45
+ def seed_everything(seed=42):
# Seed every random number generator this helper touches with the same value:
# Python's `random`, NumPy's global generator, and PyTorch (via
# `torch.manual_seed`, `torch.cuda.manual_seed` and
# `torch.cuda.manual_seed_all`). `seed` defaults to 42.
# Afterwards it sets `torch.backends.cudnn.deterministic = True` and
# `torch.backends.cudnn.benchmark = False`.
# NOTE(review): presumably intended to make repeated TTS inference calls
# reproducible; the call sites visible in this diff invoke it with 42 inside
# each inference function — confirm against the full file.
46
+ random.seed(seed)
47
+ np.random.seed(seed)
48
+ torch.manual_seed(seed)
49
+ torch.cuda.manual_seed(seed)
50
+ torch.cuda.manual_seed_all(seed)
51
+ torch.backends.cudnn.deterministic = True
52
+ torch.backends.cudnn.benchmark = False
53
 
54
  def get_model_path(model_type: str, model_size: str) -> str:
55
  """Get model path based on type and size."""
 
218
  def infer_voice_design(part, language, voice_description):
219
  """Single segment inference for Voice Design."""
220
  voice_design_model = load_model("VoiceDesign","1.7B")
221
+ seed_everything(42)
222
  wavs, sr = voice_design_model.generate_voice_design(
223
  text=part,
224
  language=language,
 
234
  def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
235
  """Single segment inference for Voice Clone using reference audio."""
236
  # tts = BASE_MODELS[model_size]
237
+ seed_everything(42)
238
  tts = load_model("Base", "0.6B")
239
  voice_clone_prompt = tts.create_voice_clone_prompt(
240
  ref_audio=audio_tuple,
 
246
  language=language,
247
  voice_clone_prompt=voice_clone_prompt,
248
  max_new_tokens=2048,
249
+ # 核心参数:固定 seed
250
+ seed=42,
251
+ temperature=0.3, # 配合低温度,音色会更稳
252
+ top_p=0.85
253
  )
254
  return wavs[0], sr
255
 
256
  @spaces.GPU
257
  def infer_voice_clone_from_prompt(part, language, prompt_file_path):
258
  """Single segment inference for Voice Clone using pre-extracted prompt."""
259
+ seed_everything(42)
260
  logger.info("正在加载音频特征文件...")
261
  voice_clone_prompt = torch.load(prompt_file_path, map_location='cuda', weights_only=False)
262
  logger.info("音频特征文件加载成功。")
 
267
  language=language,
268
  voice_clone_prompt=voice_clone_prompt,
269
  max_new_tokens=2048,
270
+ # 核心参数:固定 seed
271
+ seed=42,
272
+ temperature=0.3, # 配合低温度,音色会更稳
273
+ top_p=0.85
274
  )
275
  return wavs[0], sr
276
 
 
278
  def extract_voice_clone_prompt(ref_audio,ref_text,use_xvector_only):
279
  logger.info("正在提取参考音频特征(仅执行一次)...")
280
  tts = load_model("Base", "0.6B")
281
+ seed_everything(42)
282
  audio_tuple = _audio_to_tuple(ref_audio)
283
  if audio_tuple is None:
284
  return None, "错误:需要参考音频。"